I would like to learn PhantomJS, but I can't find a good tutorial. I have two questions:
Where is the problem in the following code? (I need to capture the label of a button and write it to a file.)
var page = require('webpage').create();
var fs = require('fs');
page.onConsoleMessage = function(msg) {
phantom.outputEncoding = "utf-8";
console.log(msg);
};
page.open("http://vk.com", function(status) {
if ( status === "success" ) {
page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function() {
page.evaluate(function() {
var str = $("#quick_login_button").text();
f = fs.open("ololo.txt", "w");
f.writeLine(str);
f.close();
console.log("done");
});
phantom.exit();
});
}
});
What PhantomJS tutorial can you recommend (not from the official site)?
Because execution inside page.evaluate() is sandboxed, the web page has no access to the phantom or fs objects; you need to return the value out of evaluate() and write the file from the outer script.
var page = require('webpage').create();
var fs = require('fs');
page.onConsoleMessage = function(msg) {
phantom.outputEncoding = "utf-8";
console.log(msg);
};
page.open("http://vk.com", function(status) {
if ( status === "success" ) {
page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function() {
var str = page.evaluate(function() {
return $("#quick_login_button").text();
});
var f = fs.open("ololo.txt", "w");
f.writeLine(str);
f.close();
console.log("done");
phantom.exit();
});
}
});
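As a side note, the jQuery injection is not strictly needed just to read the button label. Here is a minimal sketch of the same idea with plain DOM APIs and fs.write, assuming the #quick_login_button element from the question:
var page = require('webpage').create();
var fs = require('fs');

page.open("http://vk.com", function(status) {
    if (status === "success") {
        // evaluate() runs inside the page; only the returned string crosses back.
        var str = page.evaluate(function() {
            var btn = document.getElementById("quick_login_button");
            return btn ? btn.textContent : "";
        });
        // write the captured label from the outer script, where fs is available
        fs.write("ololo.txt", str, "w");
        console.log("done");
    }
    phantom.exit();
});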
PhantomJS also comes with a lot of included examples; take a look at the examples directory that ships with it.
Related
Infinite loading when trying to scrape article titles
I was trying to scrape article titles with Cheerio in Node.js, but I could not make it work: the page keeps loading forever in Google Chrome and shows no content. Here is my controller.js file, in which I use Cheerio to scrape the article titles and a Handlebars view engine to display them. The view engine works and everything else is fine, but when I scrape, Chrome loads forever with no error.
The class name I use for this website is .c-entry-box--compact__title.
Maybe I am doing something wrong, but I cannot figure it out.
var express = require("express");
var router = express.Router();
var path = require("path");
var request = require("request");
var cheerio = require("cheerio");
var Comment = require("../models/Comment.js");
var Article = require("../models/Article.js");
router.get("/", function(req, res) {
res.redirect("/articles");
});
router.get("/scrape", function(req, res) {
request("http://www.theverge.com", function(error, response, html) {
var $ = cheerio.load(html);
var titlesArray = [];
$(".c-entry-box--compact__title").each(function(i, element) {
var result = {};
result.title = $(this)
.children("a")
.text();
result.link = $(this)
.children("a")
.attr("href");
if (result.title !== "" && result.link !== "") {
if (titlesArray.indexOf(result.title) == -1) {
titlesArray.push(result.title);
Article.count({ title: result.title }, function(err, test) {
if (test === 0) {
var entry = new Article(result);
entry.save(function(err, doc) {
if (err) {
console.log(err);
} else {
console.log(doc);
}
});
}
});
} else {
console.log("Article already exists.");
}
} else {
console.log("Not saved to DB, missing data");
}
});
res.redirect("/");
});
});
router.get("/articles", function(req, res) {
Article.find()
.sort({ _id: -1 })
.exec(function(err, doc) {
if (err) {
console.log(err);
} else {
var artcl = { article: doc };
res.render("index", artcl);
}
});
});
router.get("/articles-json", function(req, res) {
Article.find({}, function(err, doc) {
if (err) {
console.log(err);
} else {
res.json(doc);
}
});
});
router.get("/clearAll", function(req, res) {
Article.remove({}, function(err, doc) {
if (err) {
console.log(err);
} else {
console.log("removed all articles");
}
});
res.redirect("/articles-json");
});
router.get("/readArticle/:id", function(req, res) {
var articleId = req.params.id;
var hbsObj = {
article: [],
body: []
};
Article.findOne({ _id: articleId })
.populate("comment")
.exec(function(err, doc) {
if (err) {
console.log("Error: " + err);
} else {
hbsObj.article = doc;
var link = doc.link;
request(link, function(error, response, html) {
var $ = cheerio.load(html);
$(".l-col__main").each(function(i, element) {
hbsObj.body = $(this)
.children(".c-entry-content")
.children("p")
.text();
res.render("article", hbsObj);
return false;
});
});
}
});
});
router.post("/comment/:id", function(req, res) {
var user = req.body.name;
var content = req.body.comment;
var articleId = req.params.id;
var commentObj = {
name: user,
body: content
};
var newComment = new Comment(commentObj);
newComment.save(function(err, doc) {
if (err) {
console.log(err);
} else {
console.log(doc._id);
console.log(articleId);
Article.findOneAndUpdate(
{ _id: req.params.id },
{ $push: { comment: doc._id } },
{ new: true }
).exec(function(err, doc) {
if (err) {
console.log(err);
} else {
res.redirect("/readArticle/" + articleId);
}
});
}
});
});
module.exports = router;
The site http://www.theverge.com adds content dynamically as you scroll, so a static HTML fetch does not see all of it. This is an example of how to get the titles with Puppeteer:
const puppeteer = require("puppeteer");
const getTitle = async () => {
try {
const browser = await puppeteer.launch({
headless: false,
});
const page = await browser.newPage();
await page.setDefaultNavigationTimeout(0);
await page.goto('https://www.theverge.com');
await page.setViewport({
width:1920,
height:1080
});
// scroll down end of page
await page.evaluate(() => {
window.scrollTo(0, window.document.body.scrollHeight);
});
await page.waitForNavigation({ waitUntil: 'networkidle0' }); // 0 network connections for 500ms
// get the titles
const titles = await page.evaluate(() => {
const textSelector = 'div.inline.pr-4.font-bold';
const texts = Array.from(document.querySelectorAll(textSelector), row => row.innerText.trim());
return texts;
});
await browser.close();
return Promise.resolve(titles);
} catch (error) {
return Promise.reject(error);
}
}
getTitle()
.then((titles) => {
console.log(titles); // first news search
})
.catch((error) => {
console.error(error);
});
This is the result:
[
'Is an upgraded M2 Ultra enough for a new Mac Pro and the Mac Studio?',
'Here’s the official trailer for Christopher Nolan’s next IMAX-filmed epic, Oppenheimer.',
'Sam Bankman-Fried’s ready to surrender himself to the US for extradition.',
'Who knew the thumb drive had such a contentious origin story?',
'But how many pebbles do you have in a jar?',
'This way for The Way of Water.',
'Netflix is taking Blockbuster behind the woodshed again.',
'I can’t escape the year-end wrap-ups.',
'The clock’s ticking if you want to get your gifts on time.',
'Want solar panels on your California home? Now might be the time.',
'Twitter Spaces has returned.',
'Apple’s facing another accusation of breaking labor laws.',
'Every game should have this feature.',
'Google’s working on simplifying smart home control on the wrist.',
'Apple could open up iOS, and the feds finally make a case against SBF.',
'I’m not the first, and I won’t be the last... but I do feel early.',
'For what it’s worth, Avatar: The Way of Water’s a good looking movie.',
'You may not want to upgrade to Apple’s new Home architecture.'
]
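If waitForNavigation turns out to be unreliable after a plain scroll (scrolling does not necessarily trigger a navigation), a variant is to keep the scroll but wait for the headline selector itself. This is only a sketch under a couple of assumptions: it reuses the same div.inline.pr-4.font-bold selector from the code above, and getTitlesBySelector is just a name made up for this example.
const puppeteer = require("puppeteer");

const getTitlesBySelector = async () => {
    const browser = await puppeteer.launch({ headless: false });
    const page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });
    await page.goto('https://www.theverge.com', { waitUntil: 'domcontentloaded' });
    // scroll to the bottom so lazily loaded entries are requested
    await page.evaluate(() => {
        window.scrollTo(0, window.document.body.scrollHeight);
    });
    // wait until at least one headline node is in the DOM, then collect them all
    await page.waitForSelector('div.inline.pr-4.font-bold');
    const titles = await page.$$eval('div.inline.pr-4.font-bold',
        rows => rows.map(row => row.innerText.trim()));
    await browser.close();
    return titles;
};

getTitlesBySelector().then(console.log).catch(console.error);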
I have a PhantomJS script in which I create a new web page, inject jQuery into it, and scrape a list of URLs from it. After that I call a function, passing it the list of URLs, and create a new web page for each one to try to recover certain information from it.
var pageGlobal = require('webpage');
function createPage(){
var page = pageGlobal.create();
page.onAlert = function(msg) {
console.log(msg);
};
return page;
}
var page=createPage();
page.open('http://www.example.com/', function(status){
if ( status === "success" ) {
page.injectJs('jquery-1.6.1.min.js');
var urlList=page.evaluate(
function(){
var urlList=[];
window.console.log = function(msg) { alert(msg) };
$("td.row1>a").each(function(index, link) {
var link=$(link).attr('href');
urlList.push(link);
});
return urlList;
});
processUrlList(urlList);
}
});
function processUrlList(urlList){
for(i=0;i<urlList.length;i++){
var currentPage=createPage();
currentPage.open("http://www.example.com"+urlList[i], function(status){
if ( status === "success" ) {
if(currentPage.injectJs('jquery-1.6.1.min.js')===false){
console.log("Error en la inyeccion");
}
currentPage.evaluate(function() {
window.console.log = function(msg) { alert(msg) };
console.log("Evaluating");
$("showAdText").each(function(index, link) {
//Capture information about the entity in this URL
})
});
}
});
}
}
The problem is that in the processUrlList function the jQuery injection always fails, returning false. Would it be a problem to create two or more page objects instead of reusing only one? What could be happening here?
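One common pitfall with this pattern (a guess, not something confirmed in the thread): currentPage is a single var shared by every loop iteration, so by the time the asynchronous open callbacks fire they all refer to the last page that was created, and injectJs may run against a page that has not finished loading. A minimal sketch that gives each iteration its own page via an immediately invoked function:
function processUrlList(urlList) {
    for (var i = 0; i < urlList.length; i++) {
        // The IIFE captures one dedicated page per URL, so the async
        // callback below does not close over the loop's shared variable.
        (function(url) {
            var currentPage = createPage();
            currentPage.open("http://www.example.com" + url, function(status) {
                if (status === "success") {
                    if (currentPage.injectJs('jquery-1.6.1.min.js') === false) {
                        console.log("jQuery injection failed for " + url);
                    }
                    currentPage.evaluate(function() {
                        window.console.log = function(msg) { alert(msg); };
                        console.log("Evaluating");
                        // Capture information about the entity in this URL
                    });
                }
            });
        })(urlList[i]);
    }
}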
I have a website with a login form. If a user is not logged in and tries to access an internal page, they are redirected to the default page. For instance, if I try to access
http://siteURL/PhantomPrint.aspx I will be redirected to http://siteURL/Default.aspx?ReturnUrl=PhantomPrint.aspx, and after login an automatic redirect takes me back to the page.
After the redirect I want to render the page with PhantomJS and save it as a PDF. The problem is that the rendering takes place before the page load is finished, and I can only render the page properly if I use timeouts. In that case, if the page load takes longer than normal, the resulting PDF is not the proper one.
Below you can find the JavaScript code:
var page = require('webpage').create();
var loadInProgress = false, testindex = 0;
page.onConsoleMessage = function (msg) {
console.log(msg);
};
var steps = [
function () {
//Load Login Page
page.open("http://siteURL.PhantomPrint.aspx", function () {
//Enter Credentials
page.evaluate(function () {
console.log("filling inputs");
var usernameInput = document.getElementById("txtUsername");
usernameInput.value = "user";
var passwordInput = document.getElementById("txtPassword");
passwordInput.value = "password";
var loginButton = document.getElementById("btnLogin");
loginButton.click();
console.log("login button was submitted");
});
});
},
function () {
// page.onLoadFinished = function () {
// Render the page to pdf
page.render('example.png');
phantom.exit();
console.log("rendering finished");
//});
}
];
interval = setInterval(function () {
if (!loadInProgress && typeof steps[testindex] == "function") {
console.log("step " + (testindex + 1));
steps[testindex]();
testindex++;
}
if (typeof steps[testindex] != "function") {
console.log("test complete!");
phantom.exit();
}
}, 1000);
Any suggestions on how I can ensure that rendering happens only after the redirected page has finished loading are welcome.
It looks like you want to process navigation steps. You would need to use page.onNavigationRequested to pick up whether a page load/redirect was issued. This will likely be hard to maintain, and you would also have to discard the idea of using a step array with setInterval.
Another possibility would be to specifically wait for some selector that is only present in the target page using waitFor (a minimal sketch of that pattern follows below), but then again, this would make the use of setInterval impossible.
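Here is a minimal sketch of that second approach, adapted from the waitfor.js example that ships with PhantomJS. It assumes the page object from the script above, and #btnLogout is only a placeholder for some element that exists exclusively on the post-login page:
// Generic polling helper, adapted from PhantomJS's waitfor.js example.
function waitFor(testFx, onReady, timeOutMillis) {
    var maxtimeOutMillis = timeOutMillis || 5000,
        start = new Date().getTime(),
        interval = setInterval(function() {
            if (new Date().getTime() - start >= maxtimeOutMillis) {
                console.log("waitFor() timed out");
                phantom.exit(1);
            } else if (testFx()) {
                clearInterval(interval);
                onReady();
            }
        }, 250);
}

// After clicking the login button, wait until an element that only the
// post-login page contains shows up, then render. "#btnLogout" is a placeholder.
waitFor(function() {
    return page.evaluate(function() {
        return document.querySelector("#btnLogout") !== null;
    });
}, function() {
    page.render('example.png');
    phantom.exit();
});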
CasperJS is built on top of PhantomJS and uses steps to navigate the site. When you use any of the then* functions, it will automatically pick up a page load and wait for it to finish before executing the callback.
var casper = require('casper').create();
casper.on("remote.message", function (msg) {
console.log(msg);
});
casper.start("http://siteURL/PhantomPrint.aspx", function () {
//Enter Credentials
this.evaluate(function () {
console.log("filling inputs");
var usernameInput = document.getElementById("txtUsername");
usernameInput.value = "user";
var passwordInput = document.getElementById("txtPassword");
passwordInput.value = "password";
});
this.click("#btnLogin");
this.echo("login button was submitted");
});
casper.then(function () {
this.capture('example.png');
});
casper.run();
This can be made even smaller by using casper.fillSelectors, for example:
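A minimal sketch of that variant (the "form" selector is an assumption; narrow it to the page's actual login form):
casper.start("http://siteURL/PhantomPrint.aspx", function () {
    // fillSelectors() fills form fields addressed by CSS selectors;
    // the third argument controls whether the form is submitted right away.
    this.fillSelectors("form", {
        "#txtUsername": "user",
        "#txtPassword": "password"
    }, false);
    this.click("#btnLogin");
});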
After more research I found a solution; see the code below.
var loadInProgress = false;
page.onLoadStarted = function () {
loadInProgress = true;
console.log("load started");
};
page.onLoadFinished = function () {
loadInProgress = false;
console.log("load finished");
};
interval = setInterval(function () {
if (!loadInProgress && typeof steps[testindex] == "function") {
console.log("step " + (testindex + 1));
steps[testindex]();
testindex++;
}
if (typeof steps[testindex] != "function") {
console.log("test complete!");
phantom.exit();
}
}, 100);
But I would like to know whether there is another solution that does not involve recursive function calls.
The code below downloads a web page and works fine, but I also want to save the result. I added the fs.write call to write a text file, but honestly I have no idea how to save the page content to the file.
var url = 'http://stackoverflow.com';
var page = require('webpage').create();
page.open(url, function(status) {
if (status === 'success') {
var html = page.evaluate(function() {
return document.documentElement.outerHTML;
});
console.log(html);
}
var fs = require('fs');
try {
fs.write("C:\phantomjs\\qhxpZ.txt", "Message to be written to the file", 'w');
} catch(e) {
console.log(e);
}
phantom.exit();
});
So, for completeness, the solution to the issue at hand should look something like:
var url = 'http://stackoverflow.com';
var fs = require('fs');
var page = require('webpage').create();
page.open(url, function(status) {
if (status === 'success') {
var html = page.evaluate(function() {
return document.documentElement.outerHTML;
});
try {
fs.write("C:\\phantomjs\\qhxpZ.txt", html, 'w');
} catch(e) {
console.log(e);
}
}
phantom.exit();
});
Just replace "Message to be written to the file" with html in the original line, and the file will be saved:
fs.write("C:\phantomjs\\qhxpZ.txt", "Message to be written to the file", 'w');
Note that the corrected code above also escapes both backslashes in the path ("C:\\phantomjs\\qhxpZ.txt"); in the original string the unescaped \p drops the first backslash.
I am trying to extract the table on the following website:
http://projects.wsj.com/jettracker/#a=HYA&d=BED&e=2011-01-01&m=indv&o=EMC+CORP.&p=0&s=2007-01-01&sort=d&t=N125TM,N424TM,N448TM,N67TM,N866TM&v=table
The URL changes to page 2, but the same table appears in both the screenshots and the saved .doc files, regardless of the pause. On the website, when you click page 2 the table updates automatically. Any help will be greatly appreciated.
Below is my code:
var casper = require('casper').create();
var fs = require('fs');
casper.start('http://projects.wsj.com/jettracker/#a=HYA&d=BED&e=2011-01-01&m=indv&o=EMC+CORP.&p=0&s=2007-01-01&sort=d&t=N125TM,N424TM,N448TM,N67TM,N866TM&v=table', function() {
this.capture("crap0" + ".png");
var firstRow = this.evaluate(function () {
var elements = __utils__.getElementsByXPath('//*[@id="table_results"]/table');
return [].map.call(elements, function(element) {
return element.innerText;
});
});
fs.write('pook.doc', firstRow, 'w');
});
casper.then(function() {
//Click on 1st result link
this.click({
type: 'xpath',
path: '//*[#id="results-pagination"]/div/a[3]'
});
// var url ='http://projects.wsj.com/jettracker/#a=HYA&d=BED&e=2011-01-01&m=indv&o=EMC+CORP.&p=1&s=2007-01-01&sort=d&t=N125TM,N424TM,N448TM,N67TM,N866TM&v=table'
//this.open(url);
this.waitFor(function check() {
return (this.getCurrentUrl() === 'http://projects.wsj.com/jettracker/#a=HYA&d=BED&e=2011-01-01&m=indv&o=EMC+CORP.&p=1&s=2007-01-01&sort=d&t=N125TM,N424TM,N448TM,N67TM,N866TM&v=table');
},
function then() { // step to execute when check() is ok
this.echo('Navigated to page 2', 'INFO');
},
function timeout() { // step to execute if check has failed
this.echo('Failed to navigate to page 2', 'ERROR');
});
this.capture("crap" + ".png");
this.wait(20000, function() {
this.echo("I've waited for 20 seconds.");
});
var firstRow2 = this.evaluate(function () {
var elements2 = __utils__.getElementsByXPath('//*[@id="table_results"]/table');
return [].map.call(elements2, function(element2) {
return element2.innerText;
});
});
fs.write('poop.doc', firstRow2, 'w');
});
casper.run();
You were close! Just remember, you should use the waitFor() function only when your page context includes dynamic content, and that isn't the case here.
Try:
var casper = require('casper').create();
var fs = require('fs');
casper.start('http://projects.wsj.com/jettracker/#a=HYA&d=BED&e=2011-01-01&m=indv&o=EMC+CORP.&p=0&s=2007-01-01&sort=d&t=N125TM,N424TM,N448TM,N67TM,N866TM&v=table', function() {
this.capture("crap0" + ".png");
var firstRow = this.evaluate(function () {
var elements = __utils__.getElementsByXPath('//*[@id="table_results"]/table');
return [].map.call(elements, function(element) {
return element.innerText;
});
});
fs.write('pook.doc', firstRow, 'w');
});
casper.then(function() {
//Click on 1st result link
this.click({
type: 'xpath',
path: '//*[#id="results-pagination"]/div/a[3]'
});
casper.then(function() {
this.capture("crap" + ".png");
var firstRow2 = this.evaluate(function () {
var elements2 = __utils__.getElementsByXPath('//*[@id="table_results"]/table');
return [].map.call(elements2, function(element2) {
return element2.innerText;
});
});
fs.write('poop.doc', firstRow2, 'w');
});
});
casper.run();