How to optimize PhantomJS for search engines to index a single page application? - selenium

I have been searching for an headless web browser that can run on server for web crawlers to index a single page application. Firslyt I tried HTMLUnit and Selenium (HtmlUnitDriver) but it seems both of them have issues with xhr requests.
And I discovered PhantomJS which performs better and seems mature. PhantomJS has an internal webserver so I decided to use it with my reverse proxy. However I ran a benchmark and PhantomJS hits a cpu core 100% and the average page loading the is around 4 seconds. The reason is I have to wait the browser to load all resources to be able to get correct results. Here is my PhantomJS script:
var page = require('webpage');
var system = require('system');
var server = require('webserver').create();
// credit: http://backbonetutorials.com/seo-for-single-page-apps/
var service = server.listen(port, { 'keepAlive': true }, function(z, response) {
var request = page.create();
var lastReceived = new Date().getTime();
var requestCount = 0;
var responseCount = 0;
var requestIds = [];
var startTime = new Date().getTime();
request.onResourceReceived = function (response) {
if (requestIds.indexOf(response.id) !== -1) {
lastReceived = new Date().getTime();
responseCount++;
requestIds[requestIds.indexOf(response.id)] = null;
}
};
request.onResourceRequested = function (request) {
if (requestIds.indexOf(request.id) === -1) {
requestIds.push(request.id);
requestCount++;
}
};
request.settings = {
loadImages: false,
javascriptEnabled: true,
loadPlugins: false
};
request.open(z.url, function (status, a) {
if (status !== 'success') {
console.log('FAIL to load the address '+a);
}
});
var checkComplete = function () {
var now = new Date().getTime();
if ((now - lastReceived > 300 && requestCount === responseCount) || now - startTime > 5000) {
clearInterval(checkCompleteInterval);
response.statusCode = 200;
response.headers = {
'Cache': 'no-cache',
'Content-Type': 'text/html; charset=UTF-8',
'Connection': 'Keep-Alive',
'Keep-Alive': 'timeout=5, max=100',
'Content-Length': request.content.length
};
response.write(request.content);
response.close();
request.release();
console.log(request.url+" -> "+(now - startTime));
}
}
var checkCompleteInterval = setInterval(checkComplete, 3);
});
Is there any improvement that can be done to speed up the script, should I just run PhantomJS using its shell command for better performance or is there any alternative to these browsers?

You can use some command line switches to improve the capture performance:
First, you can ignore all images with --load-images=no. There's no need to load images when doing the HTML snapshots.
You can also enable the cache with --disk-cache=yes (use --max-disk-cache-size to set its size in bytes)
Finally, the WebPage#onResourceRequested callback may also be useful to abort some requests (trackers, media files...) with the NetworkRequest#abort method.

Related

Logs for Protractor

I am new to protractor and want to create logs for my test cases. I used if and else to write logs. I wanted to know if there is any better way of writing logs for protractor test cases?
My Code:
var colors = require('colors/safe');
var mapFeedBackpage=require('./mapFeedBack-page.js')
describe("Map feedback Automation",function()
{
var mapFeedBack= new mapFeedBackpage();
it("Check if the Url works ",function() //spec1
{
console.log("Checking the url :"+browser.params.url+'\n')
browser.get(browser.params.url);
browser.getCurrentUrl().then(function(value){
if(/report/.test(value) === false) {
fail("Result: URL doesnt works-FAIL \n");
}
else
{
console.log(colors.green("PASS :")+browser.params.url+ "is reachable \n");
}
});
});
it("test browser should reach report road option",function() //spec2s
{
console.log("Checking if road report option is available \n");
mapFeedBack.REPORT_ROAD.click();
expect(browser.getCurrentUrl()).toContain("report_road");
browser.getCurrentUrl().then(function(value){
if(/report_road/.test(value) === false) {
fail("Result: URL doesnt works-FAIL");
}
else
{
console.log(colors.green("PASS")+" Road report option is available");
}
});
});
Yes, you can use https://www.npmjs.com/package/log4js which is basically log4j module for nodejs apps. Since protractor is nodejs program it would certainly support this. It's very easy to implement this-
var log4js = require('log4js');
var logger = log4js.getLogger();
logger.debug("Some debug messages");
or you could write a custom logger:
var logger = exports;
logger.debugLevel = 'warn';
logger.log = function(level, message) {
var levels = ['error', 'warn', 'info'];
if (levels.indexOf(level) >= levels.indexOf(logger.debugLevel) ) {
if (typeof message !== 'string') {
message = JSON.stringify(message);
};
console.log(level+': '+message);
}
}
and then use this in your scripts as :
var logger = require('./logger');
logger.debugLevel = 'warn';
logger.log('info', 'Everything started properly.');
logger.log('warn', 'Running out of memory...');
logger.log('error', { error: 'flagrant'});

Click and open a new page in PhantomJS [duplicate]

Phantomjs has these two really handy callbacks onLoadStarted and onLoadFinished which allow you to essentially pause execution while the page is loading. But I've been searching and I can't find an equivalent for if you click() a submit button or hyperlink. A similar page load happens but onLoadStarted doesn't get called for this event I guess because there isn't an explicit page.open() that happens. I'm trying to figure out a clean way to suspend execution while this load takes place.
One solution is obviously nested setTimeout's but I'd like to avoid this scenario because it's hacky and relies on trial and error instead of something reliable and more robust like testing against something or waiting for an event.
Is there a specific callback for this kind of page load that I missed? Or maybe there's some kind of generic code pattern that can deal with this sort of thing?
EDIT:
I still haven't figured out how to get it to pause. Here's the code that doesn't call the onLoadStarted() function when I call the click() command:
var loadInProgress = false;
page.onLoadStarted = function() {
loadInProgress = true;
console.log("load started");
};
page.onLoadFinished = function() {
loadInProgress = false;
console.log("load finished");
};
page.open(loginPage.url, function (status) {
if (status !== 'success') {
console.log('Unable to access network');
fs.write(filePath + errorState, 1, 'w');
phantom.exit();
} else {
page.evaluate(function (loginPage, credentials) {
console.log('inside loginPage evaluate function...\n')
document.querySelector('input[id=' + loginPage.userId + ']').value = credentials.username;
document.querySelector('input[id=' + loginPage.passId + ']').value = credentials.password;
document.querySelector('input[id=' + loginPage.submitId + ']').click();
//var aTags = document.getElementsByTagName('a')
//aTags[1].click();
}, loginPage, credentials);
page.render(renderPath + 'postLogin.png');
console.log('rendered post-login');
I double checked that the id is correct. The page.render() will show that the information is submitted, but only if I put it in a setTimeout(), otherwise it renders it immediately and I only see the credentials inputted, before the page redirect. Maybe I'm missing something else?
I think the onLoadStarted and onLoadFinished functions are everything you need. Take for example the following script:
var page = require('webpage').create();
page.onResourceReceived = function(response) {
if (response.stage !== "end") return;
console.log('Response (#' + response.id + ', stage "' + response.stage + '"): ' + response.url);
};
page.onResourceRequested = function(requestData, networkRequest) {
console.log('Request (#' + requestData.id + '): ' + requestData.url);
};
page.onUrlChanged = function(targetUrl) {
console.log('New URL: ' + targetUrl);
};
page.onLoadFinished = function(status) {
console.log('Load Finished: ' + status);
};
page.onLoadStarted = function() {
console.log('Load Started');
};
page.onNavigationRequested = function(url, type, willNavigate, main) {
console.log('Trying to navigate to: ' + url);
};
page.open("http://example.com", function(status){
page.evaluate(function(){
// click
var e = document.createEvent('MouseEvents');
e.initMouseEvent('click', true, true, window, 0, 0, 0, 0, 0, false, false, false, false, 0, null);
document.querySelector("a").dispatchEvent(e);
});
setTimeout(function(){
phantom.exit();
}, 10000);
});
It prints
Trying to navigate to: http://example.com/
Request (#1): http://example.com/
Load Started
New URL: http://example.com/
Response (#1, stage "end"): http://example.com/
Load Finished: success
Trying to navigate to: http://www.iana.org/domains/example
Request (#2): http://www.iana.org/domains/example
Load Started
Trying to navigate to: http://www.iana.org/domains/reserved
Request (#3): http://www.iana.org/domains/reserved
Response (#2, stage "end"): http://www.iana.org/domains/example
New URL: http://www.iana.org/domains/reserved
Request (#4): http://www.iana.org/_css/2013.1/screen.css
Request (#5): http://www.iana.org/_js/2013.1/jquery.js
Request (#6): http://www.iana.org/_js/2013.1/iana.js
Response (#3, stage "end"): http://www.iana.org/domains/reserved
Response (#6, stage "end"): http://www.iana.org/_js/2013.1/iana.js
Response (#4, stage "end"): http://www.iana.org/_css/2013.1/screen.css
Response (#5, stage "end"): http://www.iana.org/_js/2013.1/jquery.js
Request (#7): http://www.iana.org/_img/2013.1/iana-logo-header.svg
Request (#8): http://www.iana.org/_img/2013.1/icann-logo.svg
Response (#8, stage "end"): http://www.iana.org/_img/2013.1/icann-logo.svg
Response (#7, stage "end"): http://www.iana.org/_img/2013.1/iana-logo-header.svg
Request (#9): http://www.iana.org/_css/2013.1/print.css
Response (#9, stage "end"): http://www.iana.org/_css/2013.1/print.css
Load Finished: success
It shows that clicking a link emits the LoadStarted event once and NavigationRequested event twice, because there is a redirect. The trick is to add the event handlers before doing the action:
var page = require('webpage').create();
page.open("http://example.com", function(status){
page.onLoadFinished = function(status) {
console.log('Load Finished: ' + status);
page.render("test37_next_page.png");
phantom.exit();
};
page.onLoadStarted = function() {
console.log('Load Started');
};
page.evaluate(function(){
var e = document.createEvent('MouseEvents');
e.initMouseEvent('click', true, true, window, 0, 0, 0, 0, 0, false, false, false, false, 0, null);
document.querySelector("a").dispatchEvent(e);
});
});
If you need to do those things, maybe it is time to try something else like CasperJS. It runs on top of PhantomJS, but has a much better API for navigating web pages.
Use the high-level wrapper, nightmarejs.
You can easily click there and wait afterwards.
Here is the code (Examples section):
var Nightmare = require('nightmare');
new Nightmare()
.goto('http://yahoo.com')
.type('input[title="Search"]', 'github nightmare')
.click('.searchsubmit')
.run(function (err, nightmare) {
if (err) return console.log(err);
console.log('Done!');
});
More examples and API usage can be found at github
Here is my code based on some other answers. In my case, I didn't need to specifically evaluate any other javascript. I just needed to wait for the page to finish loading.
var system = require('system');
if (system.args.length === 1) {
console.log('Try to pass some arguments when invoking this script!');
}
else {
var page = require('webpage').create();
var address = system.args[1];
page.open(address, function(status){
page.onLoadFinished = function(status) {
console.log(page.content);
phantom.exit();
};
});
}
Save the above in a file called "scrape.js" and call it this way:
phantomjs --ssl-protocol=any --ignore-ssl-errors=true scrape.js https://www.example.com
The SSL-related params are added to avoid other issues that I was having with certain HTTPS sites (related to certificate loading issues).
Hope this helps someone!

Single page application client and server routing

I've got the following code to handle client side navigation using HTML5 pushstate (classic combination of crossroadsjs and historyjs):
History = window.History;
History.Adapter.bind(window, 'statechange', function () {
var state = History.getState();
console.log(state);
if (state.data.urlPath) {
return crossroads.parse(state.data.urlPath);
}
else
{
if (state.hash.length > 1) {
var fullHash = state.hash;
var hashPath = fullHash.slice(0, fullHash.indexOf('?'));
return crossroads.parse(hashPath);
}
}});
crossroads.normalizeFn = crossroads.NORM_AS_OBJECT;
crossroads.parse('/');
$('body').on('click', 'a', function(e) {
var title, urlPath;
urlPath = $(this).attr('href');
if (urlPath.slice(0, 1) == '#'){
return true;
}
e.preventDefault();
title = $(this).text().trim();
return History.pushState({ urlPath: urlPath }, title, urlPath);
});
It works really well. Now, to handle url bookmarking and sharing, I added and express server to handle all requests. All it does is to redirect to index.html (a sort of catchall rule):
var env = require('./env');
var fallback = require('express-history-api-fallback');
var express = require('express');
var app = express();
var config = env.config();
var root = __dirname + '/dist';
app.use(express.static(root));
app.use(fallback('index.html', { root: root }));
var port = process.env.PORT || 9090;
var server = app.listen(port, function () {
console.log('Server started at: http://localhost:' + port);
console.log(config);
});
The problem I am facing is that it successfully redirects to index.html but it doesn't load the correct route on the client side. So a request to www.mysite.com or www.mysite.com/anotherpage will always load the home page route.
I am obviously missing some code to intercept that and load the appropriate route on the client side. I just don't know what to do.
Found where the bug was:
crossroads.parse('/');
This was always redirecting to the "home" route. I just had to refactor the code a bit:
History.Adapter.bind(window, 'statechange', this.routeCrossRoads);
routeCrossRoads() {
var state = History.getState();
if (state.data.urlPath) {
return crossroads.parse(state.data.urlPath);
}
else {
if (state.hash.length > 1) {
var fullHash = state.hash;
var pos = fullHash.indexOf('?');
if (pos > 0) {
var hashPath = fullHash.slice(0, pos);
return crossroads.parse(hashPath);
}
else {
return crossroads.parse(fullHash);
}
}
else {
return crossroads.parse('/');
}
}
}

How can I see the HTTP status code from the request made by page.open?

I have a phantomJS script that contains the following:
page.open(url, function (status) {
if (status === "fail") { /* handle failure */ }
});
The status check works sometimes, but the status will still be "success" even if the request returns 500. How can I get the actual request status code?
You can do it something like this:
var page = require('webpage').create(),
system = require('system'),
resources = [];
page.open('http://google.com', function (status) {
console.log('Loaded with http status:', resources[0].status);
phantom.exit();
});
page.onResourceReceived = function(response) {
// check if the resource is done downloading
if (response.stage !== "end") return;
// apply resource filter if needed:
if (response.headers.filter(function(header) {
if (header.name == 'Content-Type' && header.value.indexOf('text/html') == 0) {
return true;
}
return false;
}).length > 0)
resources.push(response);
};
So, if you need to check the status of the first browser's request (in this case google's html page) you should see it as the first one returned in resources[0].status. In onResourceReceived handler you can add more filters for resources you try to get http code from.
UPDATE: thanks to #fotijr, added a check for completed responses
In
page.property('onResourceError', function(res) {
resources variable is undefined,
even if I set it with
var page = require('webpage').create(),
system = require('system'),
resources = [];

CasperJS looking for 404 error on links site

I'm beginner programmer. I found nice script
http://planzero.org/blog/2013/03/07/spidering_the_web_with_casperjs
I tried to rewrite this script with CasperJS test framework.
I would to get xunit report from this code
var startUrl = 'http://yoursite.foo';
var visitedUrls = [], pendingUrls = [];
var casper = require('casper').create({
pageSettings: {
loadImages: false,
loadPlugins: false
}});
var utils = require('utils')
var helpers = require('helpers')
// Spider from the given URL
casper.test.begin('href' , function(test) {
casper.start(startUrl, function() {
function spider(url) {
// Add the URL to the visited stack
visitedUrls.push(url);
// Open the URL
casper.open(url).then(function() {
test.assertHttpStatus(200, ":" + url);
// Find links present on this page
var links = this.evaluate(function() {
var links = [];
Array.prototype.forEach.call(__utils__.findAll('a'), function(e) {
links.push(e.getAttribute('href'));
});
return links;
});
// Add newly found URLs to the stack
var baseUrl = this.getGlobal('location').origin;
Array.prototype.forEach.call(links, function(link) {
var newUrl = helpers.absoluteUri(baseUrl, link);
if (pendingUrls.indexOf(newUrl) == -1 && visitedUrls.indexOf(newUrl) == -1 && !(link.search(startUrl) == -1)) {
pendingUrls.push(newUrl);
}
});
// If there are URLs to be processed
if (pendingUrls.length > 0) {
var nextUrl = pendingUrls.shift();
spider(nextUrl);
}
else {
console.log('links ended');
this.break;
}
});
}
spider(startUrl);
}).run(function(){
test.done();
});
});
Script is running but when he and Job I can't get report.
If you're trying to learn how to use CasperJS you need to start with a smaller example than that. That script is a mess which goes after a site named yoursite.foo (maybe you put that name in there?)
I would take small steps. I have a video which may help explain how to use CasperJS.
http://www.youtube.com/watch?v=Kefil5tCL9o