PhantomJS / Javascript: download web page and write to text file - phantomjs

The code below downloads a web page and works fine, but I also want to save the page to a text file. I added code that writes a text file, but honestly I have no idea how to make it save the downloaded page.
// Download a web page with PhantomJS, print its HTML, then write a text file.
var url = 'http://stackoverflow.com';
var page = require('webpage').create();
page.open(url, function(status) {
    if (status === 'success') {
        // The function passed to evaluate() runs inside the loaded page and
        // returns the serialized document to this script.
        var html = page.evaluate(function() {
            return document.documentElement.outerHTML;
        });
        console.log(html);
    }
    var fs = require('fs');
    try {
        // Each backslash in a string literal must be doubled: "\p" is not an
        // escape sequence, so "C:\phantomjs" would silently become "C:phantomjs".
        // NOTE: this still writes a fixed message, not the downloaded HTML.
        fs.write("C:\\phantomjs\\qhxpZ.txt", "Message to be written to the file", 'w');
    } catch(e) {
        console.log(e);
    }
    phantom.exit();
});

So, for completeness, the solution to the issue at hand should look something like:
// Fetch a page with PhantomJS and store its full HTML in a text file.
var fs = require('fs');
var page = require('webpage').create();
var url = 'http://stackoverflow.com';

page.open(url, function onPageLoaded(status) {
    // Nothing to save when the page could not be loaded; just quit.
    if (status !== 'success') {
        phantom.exit();
        return;
    }

    // Serialize the whole document from inside the page context.
    var html = page.evaluate(function() {
        return document.documentElement.outerHTML;
    });

    try {
        fs.write("C:\\phantomjs\\qhxpZ.txt", html, 'w');
    } catch (writeError) {
        console.log(writeError);
    }

    phantom.exit();
});

Just replace "Message to be written to the file" with html in the following line and the file will be saved:
// Backslashes are doubled so the path really is C:\phantomjs\qhxpZ.txt ("\p" alone collapses to "p").
fs.write("C:\\phantomjs\\qhxpZ.txt", "Message to be written to the file", 'w');

Related

Can TinyMCE convert image pasted as data to an upload?

I am using TinyMCE 5.7 and it handles uploading images well. However when an image is pasted from the clipboard (ex: Snipping Tool) it gets pasted as data which is not desired. I can use the setting paste_data_images to block pasting data images but I would prefer that it convert the data into an upload request like normal image upload process. Is there a way to intercept the paste and do the upload? I am using both the image and paste plugins. Thanks
Eventually I figured out how to write my own paste function. First, in the TinyMCE config:
setup: function (editor) {
editor.on('paste', function (e) {
var imageBlob = retrieveImageFromClipboardAsBlob(e);
if (!imageBlob) {
return;
}
e.preventDefault();
uploadFile(imageBlob, function (response) {
if ('location' in response) {
if (editor) {
// console.log('upload completed', response);
editor.insertContent('<img src="' + response.location + '" />');
} else {
console.log('Tinymce editor not found!');
}
}
});
});
}
Then the routine to decode the pasted info:
/**
 * Extract a pasted image from a paste event.
 *
 * @param {ClipboardEvent} pasteEvent - the paste event to inspect.
 * @returns {File|false} the first clipboard item as a file, or `false` when
 *   clipboard data is unavailable, there are no items, or a non-image item is
 *   encountered before an image file is found.
 */
function retrieveImageFromClipboardAsBlob(pasteEvent) {
    // clipboardData is null/undefined (never `false`) when unavailable, so a
    // strict comparison with `false` would let the `.items` access below throw.
    var clipboard = pasteEvent.clipboardData;
    if (!clipboard) {
        return false;
    }
    var items = clipboard.items;
    if (!items) {
        return false;
    }
    for (var i = 0; i < items.length; i++) {
        // Only paste if image is only choice
        if (items[i].type.indexOf("image") === -1) {
            return false;
        }
        // getAsFile() yields null for items that are not files; keep looking.
        var blob = items[i].getAsFile();
        if (blob !== null) {
            return blob;
        }
    }
    return false;
}
and a routine to upload the file
/**
 * Upload a file to /upload/tinymce as multipart form data.
 *
 * @param {Blob|File} file - sent as the `file` form field.
 * @param {function(Object)=} callback - called with the parsed JSON response
 *   after a successful (HTTP 200, valid JSON) upload. It is not called on failure.
 */
function uploadFile(file, callback) {
    var xhr = new XMLHttpRequest();
    xhr.upload.onprogress = function (e) {
        // Without a known total the ratio would be NaN, so report nothing.
        if (!e.lengthComputable) {
            return;
        }
        var percentComplete = (e.loaded / e.total) * 100;
        console.log("Uploaded: " + percentComplete + "%");
    };
    xhr.onload = function () {
        if (xhr.status !== 200) {
            alert("Error! Upload failed " + xhr.response);
            // Do not hand an error body to the success callback.
            return;
        }
        var parsed;
        try {
            parsed = JSON.parse(xhr.response);
        } catch (parseError) {
            // A 200 response that is not JSON is treated as a failed upload.
            alert("Error! Upload failed " + xhr.response);
            return;
        }
        if (callback) {
            callback(parsed);
        }
    };
    xhr.onerror = function () {
        alert("Error! Upload failed. Can not connect to server.");
    };
    xhr.open("POST", "/upload/tinymce", true);
    var data = new FormData();
    data.append('file', file);
    xhr.send(data);
}

Have a puppeteer generated PDF pass accessibility reports

I'm building PDFs using Puppeteer, the resulting PDF looks nice but it is failing PDF Accessibility reports.
The main issues have been the title of the PDF, and the Language of the PDF.
I have tried setting both via EXIF values (Title, Language), the title does display in certain cases but still fails Acrobat Pro's accessibility check report.
I have used another accessibility check report ( http://checkers.eiii.eu/en/pdfcheck/ ) and there the title is set successfully but not the language.
I have used --export-tagged-pdf as a launch parameter which fixed many other issues.
Would anyone have an idea how I could pass the accessibility report please? Mostly the language parameter. I'm using Node.js to generate the PDFs, even if there is another library to edit the PDF after the fact that would be really helpful, I wasn't able to figure that out.
Facing the same problem, I managed to get all the required metadata and XMP data in place except the PDF/UA identifier. I used the JS library "pdf-lib" (https://pdf-lib.js.org/docs/api/classes/pdfdocument) to set the metadata and exiftool-vendored to inject the XMP schema data.
const { execFile } = require('child_process');
const fs = require('fs');
const { promisify } = require('util');
const pdfLib = require('pdf-lib');
const exiftool = require("exiftool-vendored").exiftool;
const distexiftool = require('dist-exiftool');

// Promise-returning execFile so the exiftool CLI step can be awaited.
const execFileAsync = promisify(execFile);

const pdfPath = "your-pdf-document.pdf";
const nowDate = new Date();
const meta_creator = "The author";
const meta_author = "The author";
const meta_producer = "The producer";
const meta_title = "Your PDF title";
const meta_subject = "Your PDF subject";
const meta_creadate = `${nowDate.getFullYear()}-${nowDate.getMonth()+1}-${nowDate.getDate()}`;
const meta_keywords = ["keyword1", "keyword2", "keyword3", "keyword4"];

/**
 * Write the XMP tags to the PDF with exiftool.
 *
 * All tags go through a single write call (the original issued one call per
 * tag and repeated several of them). The PDF is deliberately NOT re-saved from
 * pdf-lib afterwards: that in-memory document predates the exiftool changes,
 * so saving it again would overwrite the tags just written.
 */
async function afterTagsOperation() {
    await exiftool.write(pdfPath, {
        'xmp:Author': meta_author,
        'xmp:Creator': meta_creator,
        'xmp:CreateDate': meta_creadate,
        'xmp:Producer': meta_producer,
        'xmp:Title': meta_title,
        'xmp:Subject': meta_subject,
        'xmp:Keywords': meta_keywords,
        'xmp:Trapped': 'false',
        'xmp:DocumentID': `uuid:${nowDate.getTime()}`,
        'xmp:Identifier': nowDate.getTime(),
        'xmp:PDFVersion': `v${nowDate.getTime()}`,
        'xmp-xmpMM:DocumentID': `uuid:${nowDate.getTime()}`,
        'xmp-dc:format': `application/pdf`,
        'xmp-dc:title': meta_title,
    });
}

/**
 * Set title, subject and language with pdf-lib, import the PDF/UA XMP template
 * with the exiftool binary, then write the remaining XMP tags.
 */
async function main() {
    // fs.readFile is callback-based; the promise API lives under fs.promises.
    const pdfData = await fs.promises.readFile(pdfPath);
    const pdfDoc = await pdfLib.PDFDocument.load(pdfData);

    // Implement PDF subject
    pdfDoc.setSubject(meta_subject);
    // Implement PDF title and the required "DisplayDocTitle" viewer preference
    pdfDoc.setTitle(meta_title, {
        showInWindowTitleBar: true,
        updateMetadata: true
    });
    // Implement PDF language
    // NOTE(review): "en-EN" is kept from the original; confirm the checker accepts it.
    pdfDoc.setLanguage("en-EN");

    // Save the file so that exiftool can load it
    const pdfBytes = await pdfDoc.save();
    await fs.promises.writeFile(pdfPath, pdfBytes);

    try {
        // Use the exiftool binary to copy the tags from the well-formed PDF/UA
        // XMP file "pdfUA-ID.xmp" into the PDF.
        try {
            await execFileAsync(distexiftool, ["-j", "-xmp<=pdfUA-ID.xmp", pdfPath]);
        } catch (error) {
            console.error(`exec error: ${error}`);
            return;
        }
        await afterTagsOperation();
    } finally {
        // Shut down the exiftool child process so Node can exit.
        await exiftool.end();
    }
}

main().catch((error) => {
    console.error(error);
    process.exitCode = 1;
});

PhantomJS getJSON unable to get a response

I'm trying to use $.getJSON inside PhantomJS, but I can't get its result. Is there a solution? I cannot simply load the URL or use includeJs directly: the page has to be called from the same domain.
So I want to open a page and do the call from there.
Here is my current code which is not working:
// Question's repro: open a page, load jQuery into it, and try to read a JSON
// resource from inside the page via $.getJSON.
var jqueryUrl = "https://code.jquery.com/jquery-latest.min.js";
page.open("http://www.example.com/", function(status) {
if (status === "success") {
page.includeJs(jqueryUrl, function() {
// The function passed to evaluate() has no return statement of its own, so
// `result` is not the JSON data.
var result = page.evaluate(function() {
$.getJSON('http://www.example.com/someJson', function(data) {
// This returns from the $.getJSON success callback only.
return data;
});
});
console.log(result);
// Exits immediately after evaluate() returns, without waiting for the request.
phantom.exit();
});
} else {
// Page failed to load: exit with a non-zero status.
phantom.exit(1);
}
});
Thanks for any help!
You need to use page.onCallback in combination with window.callPhantom, because you are making an asynchronous HTTP request inside the page and the result can only be returned after the request is done.
I haven't tested exactly this code, but it should be something like this:
// Load a page, inject jQuery, fetch JSON from inside the page, and hand the
// result back to the PhantomJS script through window.callPhantom.
var jqueryUrl = "https://code.jquery.com/jquery-latest.min.js";
page.open("http://www.example.com/", function(status) {
    if (status !== "success") {
        phantom.exit(1);
        return;
    }
    // onCallback is a handler property, not a method: it must be assigned.
    // Calling page.onCallback(...) would throw instead of registering a handler.
    page.onCallback = function(data) {
        // got the data!
        console.log(data);
        phantom.exit();
    };
    page.includeJs(jqueryUrl, function() {
        page.evaluate(function() {
            $.getJSON('http://www.example.com/someJson', function(data) {
                // Forward only the payload; jQuery also passes textStatus and
                // jqXHR to this callback, which callPhantom does not need.
                window.callPhantom(data);
            });
        });
    });
});

PhantomJs Injecting jQuery in different pages

I have a PhantomJS script in which I create a new webpage, inject jQuery into it and scrape a list of URLs from it. After that I call a function, passing it the list of URLs, which creates a new webpage for each one and tries to retrieve certain information from it.
var pageGlobal = require('webpage');
/**
 * Create a new PhantomJS page whose alert() calls are echoed to the console.
 *
 * @returns {Object} the page produced by pageGlobal.create().
 */
function createPage(){
    // Relay alert() text from the page to this script's console.
    function relayAlert(text) {
        console.log(text);
    }

    var freshPage = pageGlobal.create();
    freshPage.onAlert = relayAlert;
    return freshPage;
}
// Open the index page, collect the href of every "td.row1>a" link, and pass
// the resulting list to processUrlList.
var page=createPage();
page.open('http://www.example.com/', function onIndexLoaded(loadStatus){
    if ( loadStatus !== "success" ) {
        return;
    }
    page.injectJs('jquery-1.6.1.min.js');
    var urlList=page.evaluate(function(){
        // Route the page's console.log through alert() so it reaches onAlert.
        window.console.log = function(msg) { alert(msg) };
        var hrefs=[];
        $("td.row1>a").each(function(){
            hrefs.push($(this).attr('href'));
        });
        return hrefs;
    });
    processUrlList(urlList);
});
/**
 * Open every URL in the list in its own page, inject jQuery and evaluate the
 * scraping code there.
 *
 * @param {Array<string>} urlList - paths appended to "http://www.example.com".
 */
function processUrlList(urlList){
    if (!urlList) {
        return;
    }

    // One function call per URL gives each asynchronous open() callback its own
    // `currentPage`. With `var currentPage` declared directly in the loop, all
    // callbacks shared a single variable and therefore acted on the page created
    // last, not the page that had just finished loading.
    // NOTE(review): this is a likely cause of injectJs returning false — confirm.
    function scrapeEntity(path){
        var currentPage=createPage();
        currentPage.open("http://www.example.com"+path, function(status){
            if ( status !== "success" ) {
                return;
            }
            if(currentPage.injectJs('jquery-1.6.1.min.js')===false){
                console.log("Error en la inyeccion");
            }
            currentPage.evaluate(function() {
                window.console.log = function(msg) { alert(msg) };
                console.log("Evaluating");
                $("showAdText").each(function(index, link) {
                    //Capture information about the entity in this URL
                })
            });
        });
    }

    // `var i` keeps the counter local; the original leaked it as a global.
    for(var i=0;i<urlList.length;i++){
        scrapeEntity(urlList[i]);
    }
}
The problem is that, in the processUrlList function, the injection of jQuery always fails and returns false. Would it be a problem to create two or more page objects instead of reusing only one? What could be happening here?

How to use phantomjs?

I would like to learn PhantomJS, but I can't find a good tutorial. I have 2 questions:
where is problem in following code (need to capture label of button and write to file):
// Question's repro: read the label of a button on the page and write it to a file.
var page = require('webpage').create();
var fs = require('fs');
// Echo console messages from the page to this script's console.
page.onConsoleMessage = function(msg) {
phantom.outputEncoding = "utf-8";
console.log(msg);
};
page.open("http://vk.com", function(status) {
if ( status === "success" ) {
page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function() {
// Everything inside this function runs in the sandboxed web page, which has
// no access to phantom objects such as the `fs` module required above.
page.evaluate(function() {
var str = $("#quick_login_button").text();
f = fs.open("ololo.txt", "w");
f.writeLine(str);
f.close();
console.log("done");
});
phantom.exit();
});
}
});
what tutorial in phantomjs you can advice to me? (not from official site)
Because execution is sandboxed, the web page has no access to the phantom objects.
// Read the label of a button on the page and write it to "ololo.txt".
var page = require('webpage').create();
var fs = require('fs');

// Set the output encoding once, up front, so it also applies to output that
// is printed before any page console message arrives.
phantom.outputEncoding = "utf-8";

// Echo console messages from the page to this script's console.
page.onConsoleMessage = function(msg) {
    console.log(msg);
};

page.open("http://vk.com", function(status) {
    if ( status !== "success" ) {
        // Without an explicit exit PhantomJS would keep running forever.
        console.log("Unable to load http://vk.com");
        phantom.exit(1);
        return;
    }
    page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function() {
        // Only the DOM lookup runs inside the sandboxed page; the string it
        // returns is handed back to this script, where `fs` is available.
        var str = page.evaluate(function() {
            return $("#quick_login_button").text();
        });
        // `var` keeps the file handle local instead of leaking a global `f`.
        var f = fs.open("ololo.txt", "w");
        try {
            f.writeLine(str);
        } finally {
            // Release the handle even if the write throws.
            f.close();
        }
        console.log("done");
        phantom.exit();
    });
});
PhantomJS comes with a lot of included examples. Take a look here.