I have a PhantomJS script:
var page = require('webpage').create();
var system = require('system');
page.settings.userAgent = 'SpecialAgent';
var i = 1;
var url = 'http://www.google.cz/?test=' + i;
console.log( "url='" + url + "'" );
page.open(url, function (status) {
    if (status !== 'success') {
        console.log('Unable to access network');
    } else {
        console.log('Opened ok');
        phantom.exit();
    }
});
which works fine, but when I wrap the page.open call in a for loop like this:
var page = require('webpage').create();
var system = require('system');
page.settings.userAgent = 'SpecialAgent';
for (var i = 1; i <= 2; ++i) {
    var url = 'http://www.google.cz/?test=' + i;
    console.log("url='" + url + "'");
    page.open(url, function (status) {
        if (status !== 'success') {
            console.log('Unable to access network');
        } else {
            console.log('Opened ok');
            phantom.exit();
        }
    });
}
I'm getting

Unable to access network

I think it's because the page.open calls are executed in parallel, and phantom.exit() is probably in the wrong place. Am I correct? How can I wait for one page.open to finish processing before executing the next iteration?
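I'm guessing the fix is to chain the calls so each iteration only starts from the previous callback. A rough, untested sketch of what I have in mind (the recursion replaces the for loop):

var page = require('webpage').create();
page.settings.userAgent = 'SpecialAgent';

function openNext(i) {
    if (i > 2) {
        phantom.exit();
        return;
    }
    var url = 'http://www.google.cz/?test=' + i;
    console.log("url='" + url + "'");
    page.open(url, function (status) {
        console.log(status === 'success' ? 'Opened ok' : 'Unable to access network');
        openNext(i + 1); // start the next request only after this one has finished
    });
}

openNext(1);

Is something like this the right approach?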
The problem is that PhantomJS versions before 1.9.8 use SSLv3 by default, but because of the POODLE vulnerability most web servers have disabled SSLv3 support, so you need to explicitly pass the --ssl-protocol=tlsv1 (or --ssl-protocol=any) command-line option.
When you run your script, add it like this:
phantomjs --ssl-protocol=any yourscript.js
If you're still getting an error, it may be due to SSL certificate problems. Adding the --ignore-ssl-errors=yes option will let the page load anyway, as it ignores all SSL errors, including malicious ones:
phantomjs --ssl-protocol=any --ignore-ssl-errors=yes yourscript.js
PhantomJS has these two really handy callbacks, onLoadStarted and onLoadFinished, which let you essentially pause execution while the page is loading. But I've been searching and I can't find an equivalent for when you click() a submit button or hyperlink. A similar page load happens, but onLoadStarted doesn't get called for this event, I guess because there isn't an explicit page.open(). I'm trying to figure out a clean way to suspend execution while this load takes place.
One solution is obviously nested setTimeouts, but I'd like to avoid that because it's hacky and relies on trial and error instead of something reliable and more robust, like testing against a condition or waiting for an event.
Is there a specific callback for this kind of page load that I missed? Or maybe there's some kind of generic code pattern that can deal with this sort of thing?
EDIT:
I still haven't figured out how to get it to pause. Here's my code; onLoadStarted() is not called when I invoke the click() command:
var fs = require('fs');
var loadInProgress = false;

page.onLoadStarted = function() {
    loadInProgress = true;
    console.log("load started");
};

page.onLoadFinished = function() {
    loadInProgress = false;
    console.log("load finished");
};

page.open(loginPage.url, function (status) {
    if (status !== 'success') {
        console.log('Unable to access network');
        fs.write(filePath + errorState, 1, 'w');
        phantom.exit();
    } else {
        page.evaluate(function (loginPage, credentials) {
            console.log('inside loginPage evaluate function...\n');
            document.querySelector('input[id=' + loginPage.userId + ']').value = credentials.username;
            document.querySelector('input[id=' + loginPage.passId + ']').value = credentials.password;
            document.querySelector('input[id=' + loginPage.submitId + ']').click();
            //var aTags = document.getElementsByTagName('a');
            //aTags[1].click();
        }, loginPage, credentials);
        page.render(renderPath + 'postLogin.png');
        console.log('rendered post-login');
    }
});
I double-checked that the id is correct. page.render() shows that the information was submitted, but only if I wrap it in a setTimeout(); otherwise it renders immediately and I only see the credentials filled in, before the page redirect. Maybe I'm missing something else?
I think the onLoadStarted and onLoadFinished callbacks are all you need. Take, for example, the following script:
var page = require('webpage').create();

page.onResourceReceived = function(response) {
    if (response.stage !== "end") return;
    console.log('Response (#' + response.id + ', stage "' + response.stage + '"): ' + response.url);
};
page.onResourceRequested = function(requestData, networkRequest) {
    console.log('Request (#' + requestData.id + '): ' + requestData.url);
};
page.onUrlChanged = function(targetUrl) {
    console.log('New URL: ' + targetUrl);
};
page.onLoadFinished = function(status) {
    console.log('Load Finished: ' + status);
};
page.onLoadStarted = function() {
    console.log('Load Started');
};
page.onNavigationRequested = function(url, type, willNavigate, main) {
    console.log('Trying to navigate to: ' + url);
};

page.open("http://example.com", function(status){
    page.evaluate(function(){
        // synthesize a click on the first link
        var e = document.createEvent('MouseEvents');
        e.initMouseEvent('click', true, true, window, 0, 0, 0, 0, 0, false, false, false, false, 0, null);
        document.querySelector("a").dispatchEvent(e);
    });
    setTimeout(function(){
        phantom.exit();
    }, 10000);
});
It prints
Trying to navigate to: http://example.com/
Request (#1): http://example.com/
Load Started
New URL: http://example.com/
Response (#1, stage "end"): http://example.com/
Load Finished: success
Trying to navigate to: http://www.iana.org/domains/example
Request (#2): http://www.iana.org/domains/example
Load Started
Trying to navigate to: http://www.iana.org/domains/reserved
Request (#3): http://www.iana.org/domains/reserved
Response (#2, stage "end"): http://www.iana.org/domains/example
New URL: http://www.iana.org/domains/reserved
Request (#4): http://www.iana.org/_css/2013.1/screen.css
Request (#5): http://www.iana.org/_js/2013.1/jquery.js
Request (#6): http://www.iana.org/_js/2013.1/iana.js
Response (#3, stage "end"): http://www.iana.org/domains/reserved
Response (#6, stage "end"): http://www.iana.org/_js/2013.1/iana.js
Response (#4, stage "end"): http://www.iana.org/_css/2013.1/screen.css
Response (#5, stage "end"): http://www.iana.org/_js/2013.1/jquery.js
Request (#7): http://www.iana.org/_img/2013.1/iana-logo-header.svg
Request (#8): http://www.iana.org/_img/2013.1/icann-logo.svg
Response (#8, stage "end"): http://www.iana.org/_img/2013.1/icann-logo.svg
Response (#7, stage "end"): http://www.iana.org/_img/2013.1/iana-logo-header.svg
Request (#9): http://www.iana.org/_css/2013.1/print.css
Response (#9, stage "end"): http://www.iana.org/_css/2013.1/print.css
Load Finished: success
It shows that clicking a link fires the onLoadStarted event once and onNavigationRequested twice, because there is a redirect. The trick is to add the event handlers before performing the action:
var page = require('webpage').create();

page.open("http://example.com", function(status){
    page.onLoadFinished = function(status) {
        console.log('Load Finished: ' + status);
        page.render("test37_next_page.png");
        phantom.exit();
    };
    page.onLoadStarted = function() {
        console.log('Load Started');
    };
    page.evaluate(function(){
        var e = document.createEvent('MouseEvents');
        e.initMouseEvent('click', true, true, window, 0, 0, 0, 0, 0, false, false, false, false, 0, null);
        document.querySelector("a").dispatchEvent(e);
    });
});
If you need to do those things, maybe it is time to try something else like CasperJS. It runs on top of PhantomJS, but has a much better API for navigating web pages.
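For instance, the click-and-wait above might look roughly like this in CasperJS (just a sketch; the selector and filename are illustrative):

var casper = require('casper').create();

casper.start('http://example.com');

// thenClick() clicks the element and schedules a step for the resulting navigation
casper.thenClick('a');

casper.then(function () {
    // this runs only after the click-triggered page load has finished
    this.capture('next_page.png');
});

casper.run();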
Use the high-level wrapper Nightmare (nightmarejs).
It makes it easy to click something and then wait afterwards.
Here is the code (from its Examples section):
var Nightmare = require('nightmare');

new Nightmare()
    .goto('http://yahoo.com')
    .type('input[title="Search"]', 'github nightmare')
    .click('.searchsubmit')
    .run(function (err, nightmare) {
        if (err) return console.log(err);
        console.log('Done!');
    });
More examples and API usage can be found on GitHub.
Here is my code, based on some other answers. In my case I didn't need to evaluate any other JavaScript; I just needed to wait for the page to finish loading.
var system = require('system');

if (system.args.length === 1) {
    console.log('Try to pass some arguments when invoking this script!');
} else {
    var page = require('webpage').create();
    var address = system.args[1];
    page.open(address, function(status){
        page.onLoadFinished = function(status) {
            console.log(page.content);
            phantom.exit();
        };
    });
}
Save the above in a file called "scrape.js" and call it this way:
phantomjs --ssl-protocol=any --ignore-ssl-errors=true scrape.js https://www.example.com
The SSL-related params are added to avoid other issues that I was having with certain HTTPS sites (related to certificate loading issues).
Hope this helps someone!
My script works fine so far, loading every page from the text file line by line in sequential order (page.open is asynchronous and the page object is global, so it gets overwritten by new requests; running multiple page.open() calls at once is a big clusterfuck), matching every request against a specific domain and printing JSON values from it.
But if I use a .txt file with more than ~150 links, it just crashes every time, mostly with no error message and no crash dump, like this:
PhantomJS has crashed. Please read the crash reporting guide at
http://phantomjs.org/crash-reporting.html and file a bug report at
https://github.com/ariya/phantomjs/issues/new.
Unfortunately, no crash dump is available.
(Is %TEMP% (C:\Users\XXX\AppData\Local\Temp) a directory you cannot write?)
I can reproduce it easily by running the script multiple times; it doesn't matter whether I run them all at once or one after another.
How can I prevent the crashes? My script is useless if PhantomJS can't handle that many links.
But sometimes I get a crash dump:
PhantomJS has crashed. Please read the crash reporting guide at
http://phantomjs.org/crash-reporting.html and file a bug report at
https://github.com/ariya/phantomjs/issues/new.
Please attach the crash dump file:
C:\Users\XXX\AppData\Local\Temp\a4fd6af6-1244-44d3-8938-3aabe298c2fa.dmp
https://www.dropbox.com/s/i3qi5ed33mbblie/500%20links%20-a4fd6af6-1244-44d3-8938-3aabe298c2fa.dmp?dl=1
https://www.dropbox.com/s/najdz9fhdexvav1/500%20links-%2095ebab5c-859b-40e9-936b-84967471779b.dmp?dl=1
https://www.dropbox.com/s/1d2t8rtev85yf96/500%20links%20-%20d450c8e1-9728-41c7-ba52-dfef466f0222.dmp?dl=1
And in rare cases I even get an error message; Process Explorer shows the process peaking at 21 threads:
QThread::start: Failed to create thread ()
var fs = require('fs');
var stream = fs.open('500sitemap.txt', 'r');
var webPage = require('webpage');
var i = 1;
var hasFound = Array();
var hasonLoadFinished = Array();

function handle_page(line) {
    var page = webPage.create();
    page.settings.loadImages = false;
    page.open(line, function() {});

    page.onResourceRequested = function(requestData, request) {
        var match = requestData.url.match(/example.de\/ac/g);
        if (match != null) {
            hasFound[line] = true;
            var targetString = decodeURI(JSON.stringify(requestData.url));
            var klammerauf = targetString.indexOf("{");
            var jsonobjekt = targetString.substr(klammerauf, (targetString.indexOf("}") - klammerauf) + 1);
            var targetJSON = decodeURIComponent(jsonobjekt);
            var t = JSON.parse(targetJSON);
            console.log(i + " " + t + " " + t['id']);
            request.abort();
        } else {
            return;
        }
    };

    page.onLoadFinished = function(status) {
        if (!hasonLoadFinished[line]) {
            hasonLoadFinished[line] = true;
            if (!hasFound[line]) {
                console.log(i + " :NOT FOUND: " + line);
                console.log("");
            }
            i++;
            setTimeout(page.close, 200);
            nextPage();
        }
    };
}

function nextPage() {
    var line = stream.readLine();
    if (!line) {
        end = Date.now();
        console.log("");
        console.log(((end - start) / 1000) + " seconds");
        phantom.exit(0);
    }
    hasFound[line] = false;
    hasonLoadFinished[line] = false;
    handle_page(line);
}

start = Date.now();
nextPage();
Edit: crashed with 1.9.8 after 3836 links... back to square one.
Seems like the problem lies in the 2.0 version. I tested 1.9.8 out of frustration, and it works: 60% less RAM used and no crashes with 1000 URLs.
A crash report has been filed on GitHub. What a relief; it works.
I have a PhantomJS script that contains the following:
page.open(url, function (status) {
    if (status === "fail") { /* handle failure */ }
});
The status check works sometimes, but status is still "success" even when the request returns a 500. How can I get the actual HTTP status code?
You can do it like this:
var page = require('webpage').create(),
    system = require('system'),
    resources = [];

page.open('http://google.com', function (status) {
    console.log('Loaded with http status:', resources[0].status);
    phantom.exit();
});

page.onResourceReceived = function(response) {
    // check if the resource is done downloading
    if (response.stage !== "end") return;
    // apply a resource filter if needed: here, keep only text/html responses
    if (response.headers.filter(function(header) {
            return header.name == 'Content-Type' && header.value.indexOf('text/html') == 0;
        }).length > 0) {
        resources.push(response);
    }
};
So, if you need the status of the browser's first request (in this case Google's HTML page), you'll find it in resources[0].status. In the onResourceReceived handler you can add more filters for the resources you want the HTTP code from.
UPDATE: thanks to @fotijr, added a check for completed responses.
In

page.property('onResourceError', function(res) {

the resources variable is undefined, even if I set it with:

var page = require('webpage').create(),
    system = require('system'),
    resources = [];
I'm testing out PhantomJS and trying to return all the startups listed on angel.co. I decided to go with PhantomJS since I need to paginate through the front page by clicking "Next" at the bottom. Right now this code does not return any results. I'm completely new to PhantomJS and have read through all the code examples, so any guidance would be much appreciated.
var page = require('webpage').create();

page.open('https://angel.co/startups', function(status) {
    if (status !== 'success') {
        console.log('Unable to access network');
    } else {
        page.evaluate(function() {
            var list = document.querySelectorAll('div.resume');
            for (var i = 0; i < list.length; i++) {
                console.log((i + 1) + ":" + list[i].innerText);
            }
        });
    }
    phantom.exit();
});
By default, console messages evaluated in the page do not appear in your PhantomJS console.
When you execute code under page.evaluate(...), that code runs in the context of the page. So your console.log((i + 1) + ":" + list[i].innerText); is logged inside the headless browser itself, rather than in PhantomJS.
If you want all console messages to be passed along to PhantomJS itself, use the following after opening your page:
page.onConsoleMessage = function (msg) { console.log(msg); };
page.onConsoleMessage is triggered whenever you print to the console from within the page. With this callback, you're asking PhantomJS to echo the message to its own standard output stream.
For reference, your final code would look like this (it prints successfully for me):
var page = require('webpage').create();

page.open('https://angel.co/startups', function(status) {
    page.onConsoleMessage = function (msg) { console.log(msg); };
    if (status !== 'success') {
        console.log('Unable to access network');
    } else {
        page.evaluate(function() {
            var list = document.querySelectorAll('div.resume');
            for (var i = 0; i < list.length; i++) {
                console.log((i + 1) + ":" + list[i].innerText);
            }
        });
    }
    phantom.exit();
});
I have been searching for a headless web browser that can run on a server so web crawlers can index a single-page application. First I tried HtmlUnit and Selenium (HtmlUnitDriver), but it seems both of them have issues with XHR requests.
Then I discovered PhantomJS, which performs better and seems mature. PhantomJS has an internal web server, so I decided to use it behind my reverse proxy. However, I ran a benchmark: PhantomJS pegs a CPU core at 100% and the average page load time is around 4 seconds. The reason is that I have to wait for the browser to load all resources before I can get correct results. Here is my PhantomJS script:
var page = require('webpage');
var system = require('system');
var server = require('webserver').create();

// NOTE: `port` was not defined in this excerpt; a value is assumed here
var port = 8585;

// credit: http://backbonetutorials.com/seo-for-single-page-apps/
var service = server.listen(port, { 'keepAlive': true }, function(z, response) {
    var request = page.create();
    var lastReceived = new Date().getTime();
    var requestCount = 0;
    var responseCount = 0;
    var requestIds = [];
    var startTime = new Date().getTime();

    request.onResourceReceived = function (resp) {
        if (requestIds.indexOf(resp.id) !== -1) {
            lastReceived = new Date().getTime();
            responseCount++;
            requestIds[requestIds.indexOf(resp.id)] = null;
        }
    };
    request.onResourceRequested = function (req) {
        if (requestIds.indexOf(req.id) === -1) {
            requestIds.push(req.id);
            requestCount++;
        }
    };
    request.settings = {
        loadImages: false,
        javascriptEnabled: true,
        loadPlugins: false
    };
    request.open(z.url, function (status, a) {
        if (status !== 'success') {
            console.log('FAIL to load the address ' + a);
        }
    });

    // Consider the page done once the network has been quiet for 300 ms
    // and all requests have been answered, or after a 5 s hard timeout.
    var checkComplete = function () {
        var now = new Date().getTime();
        if ((now - lastReceived > 300 && requestCount === responseCount) || now - startTime > 5000) {
            clearInterval(checkCompleteInterval);
            response.statusCode = 200;
            response.headers = {
                'Cache': 'no-cache',
                'Content-Type': 'text/html; charset=UTF-8',
                'Connection': 'Keep-Alive',
                'Keep-Alive': 'timeout=5, max=100',
                'Content-Length': request.content.length
            };
            response.write(request.content);
            response.close();
            request.release();
            console.log(request.url + " -> " + (now - startTime));
        }
    };
    var checkCompleteInterval = setInterval(checkComplete, 3);
});
Is there any improvement I can make to speed up the script? Should I just run PhantomJS from its shell command for better performance, or is there an alternative to these browsers?
You can use some command line switches to improve the capture performance:
First, you can ignore all images with --load-images=no. There's no need to load images when doing the HTML snapshots.
You can also enable the cache with --disk-cache=yes (use --max-disk-cache-size to set its size in bytes)
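Combined, an invocation might look like this (the script name and cache size are just examples):

phantomjs --load-images=no --disk-cache=yes --max-disk-cache-size=50000000 snapshot-server.js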
Finally, the WebPage#onResourceRequested callback may also be useful to abort some requests (trackers, media files...) with the NetworkRequest#abort method.
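As a sketch of that last point, applied to the script above (merge it into your existing onResourceRequested handler, since assigning it twice overwrites the first; the URL patterns below are only examples, not taken from your setup):

request.onResourceRequested = function (requestData, networkRequest) {
    // skip media files and common trackers; adjust the patterns to your pages
    if (/\.(png|jpe?g|gif|svg|woff2?|mp4)(\?|$)/i.test(requestData.url) ||
        /google-analytics\.com|doubleclick\.net/.test(requestData.url)) {
        networkRequest.abort();
    }
};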