My script works fine so far, loading every page in the text file line by line in sequentiell order (page.open is asynchron and the page object is global = overwriting on new requests, it's a big clusterfuck running multiple page.open() at once), matching every request for a specific domain and printing JSON values from it.
But if I use a .txt-file with over ~150 links, it just crashes every time, mostly with no error message and with no crash dump like this:
PhantomJS has crashed. Please read the crash reporting guide at
http://phantomjs.org/crash-reporting.html and file a bug report at
https://github.com/ariya/phantomjs/issues/new.
Unfortunately, no crash dump is available.
(Is %TEMP% (C:\Users\XXX\AppData\Local\Temp) a directory you cannot write?)
I can reproduce that easily if I run it multiple times, doesn't matter if I do it at once or one after one.
How can I prevent the crashes? My script is useless if Phantom can't handle that.
But sometimes I get a crash dump:
PhantomJS has crashed. Please read the crash reporting guide at
http://phantomjs.org/crash-reporting.html and file a bug report at
https://github.com/ariya/phantomjs/issues/new.
Please attach the crash dump file:
C:\Users\XXX\AppData\Local\Temp\a4fd6af6-1244-44d3-8938-3aabe298c2fa.dmp
https://www.dropbox.com/s/i3qi5ed33mbblie/500%20links%20-a4fd6af6-1244-44d3-8938-3aabe298c2fa.dmp?dl=1
https://www.dropbox.com/s/najdz9fhdexvav1/500%20links-%2095ebab5c-859b-40e9-936b-84967471779b.dmp?dl=1
https://www.dropbox.com/s/1d2t8rtev85yf96/500%20links%20-%20d450c8e1-9728-41c7-ba52-dfef466f0222.dmp?dl=1
And in rare cases I even get an error message, Process Explorer says the process has a maximum of 21 threads at once
QThread::start: Failed to create thread ()
console.log('Hello, world!');
var fs = require('fs');
var stream = fs.open('500sitemap.txt', 'r');
var webPage = require('webpage');
var i = 1;
var hasFound = Array();
var hasonLoadFinished = Array();
function handle_page(line) {
var page = webPage.create();
page.settings.loadImages = false;
page.open(line, function() {});
page.onResourceRequested = function(requestData, request) {
var match = requestData.url.match(/example.de\/ac/g)
if (match != null) {
hasFound[line] = true;
var targetString = decodeURI(JSON.stringify(requestData.url));
var klammerauf = targetString.indexOf("{");
var jsonobjekt = targetString.substr(klammerauf, (targetString.indexOf("}") - klammerauf) + 1);
targetJSON = (decodeURIComponent(jsonobjekt));
var t = JSON.parse(targetJSON);
console.log(i + " " + t + " " + t['id']);
request.abort;
} else {
//hasFound = false;
return;
}
};
page.onLoadFinished = function(status) {
if (!hasonLoadFinished[line]) {
hasonLoadFinished[line] = true;
if (!hasFound[line]) {
console.log(i + " :NOT FOUND: " + line);
console.log("");
}
i++;
setTimeout(page.close, 200);
nextPage();
}
}
};
function nextPage() {
var line = stream.readLine();
if (!line) {
end = Date.now();
console.log("");
console.log(((end - start) / 1000) + " Sekunden");
phantom.exit(0);
}
hasFound[line] = false;
hasonLoadFinished[line] = false;
handle_page(line);
}
start = Date.now();
nextPage();
/edit crashed with 1.9.8 after 3836 links .... back to start ...........
Seems like the problem lies into the 2.0 version. Tested 1.9.8 out of frustration and - it works, 60% less RAM used, no crashes with 1000 Urls.
Crash report on github is done, what a relief, it works.
Related
I am trying to teach myself nodejs and expressjs, however coming from java and c++ this is proving difficult to get used to.
I made a simple and messy module that it is supposed to return a weather forecast for a given zip code.
The way this happens is by taking the user zip code and using a google api to generate the geo coordinates for that zip code. I get the coordinates from the JASON file and then provide them to the next api call, this call is done to the forecast.io api and this time the weather data for the location is also taken from a JASON file.
Coming from java and with a not so solid background on JavaScript I am having a hard time making these two functions wait for one another, in this case I need the google api call to finish first because the coordinates it will provide are needed for the second api call. Can someone take a look at this code and tell me if the strategy I used is correct/ provide a suggestion so that I can know what is done in javascript in situations like this.
here is the code:
// The required modules.
var http = require("http");
var https = require("https");
//result object
var resultSet = {
latitude :"",
longitude:"",
localInfo:"",
weather:"",
humidity:"",
pressure:"",
time:""
};
//print out error messages
function printError(error){
console.error(error.message);
}
//Forecast API required information:
//key for the forecast IO app
var forecast_IO_Key = "this is my key, not publishing for security reasons";
var forecast_IO_Web_Adress = "https://api.forecast.io/forecast/";
//Create Forecast request string function
function createForecastRequest(latitude, longitude){
var request = forecast_IO_Web_Adress + forecast_IO_Key + "/"
+ latitude +"," + longitude;
return request;
}
//Google GEO API required information:
//Create Google Geo Request
var google_GEO_Web_Adress = "https://maps.googleapis.com/maps/api/geocode/json?address=";
function createGoogleGeoMapRequest(zipCode){
var request = google_GEO_Web_Adress+zipCode + "&sensor=false";
return request;
}
function get(zipCode){
// 1- Need to request google for geo locations using a given zip
var googleRequest = https.get(createGoogleGeoMapRequest(zipCode), function(response){
//console.log(createGoogleGeoMapRequest(zipCode));
var body = "";
var status = response.statusCode;
//a- Read the data.
response.on("data", function(chunk){
body+=chunk;
});
//b- Parse the data.
response.on("end", function(){
if(status === 200){
try{
var coordinates = JSON.parse(body);
resultSet.latitude = coordinates.results[0].geometry.location.lat;
resultSet.longitude = coordinates.results[0].geometry.location.lng;
resultSet.localInfo = coordinates.results[0].address_components[0].long_name + ", " +
coordinates.results[0].address_components[1].long_name + ", " +
coordinates.results[0].address_components[2].long_name + ", " +
coordinates.results[0].address_components[3].long_name + ". ";
}catch(error){
printError(error.message);
}finally{
connectToForecastIO(resultSet.latitude,resultSet.longitude);
}
}else{
printError({message: "Error with GEO API"+http.STATUS_CODES[response.statusCode]})
}
});
});
function connectToForecastIO(latitude,longitude){
var forecastRequest = https.get(createForecastRequest(latitude,longitude),function(response){
// console.log(createForecastRequest(latitude,longitude));
var body = "";
var status = response.statusCode;
//read the data
response.on("data", function(chunk){
body+=chunk;
});
//parse the data
response.on("end", function(){
try{
var weatherReport = JSON.parse(body);
resultSet.weather = weatherReport.currently.summary;
resultSet.humidity = weatherReport.currently.humidity;
resultSet.temperature = weatherReport.currently.temperature;
resultSet.pressure = weatherReport.currently.pressure;
resultSet.time = weatherReport.currently.time;
}catch(error){
printError(error.message);
}finally{
return resultSet;
}
});
});
}
}
//define the name of the outer module.
module.exports.get = get;
is the return statement properly placed? Is my use of finally proper in here? Please notice that I come from a java background and in java is perfectly fine to use the try{} catch(){} and finally{} blocks to execute closure code, it was the only way i managed this module to work. But now that i have incorporated some Express and I try to execute this module's method from another module, all I am getting is an undefined return.
You could use the Promise API, kind of like Futures in Java, so basically what you could do is wrap both functions in promises and the you could wait for resolve to execute the next function
var googleRequest = function(zipcode) {
return new Promise(function(resolve, reject) {
var request = https.get(createGoogleGeoMapRequest(zipCode), function(response) {
if (response.statusCode !== 200) {
reject(new Error('Failed to get request status:' + response.statusCode));
}
var body = "";
//a- Read the data.
response.on("data", function(chunk) {
body+=chunk;
});
//b- Parse the data.
response.on("end", function(body) {
var coordinates = JSON.parse(body);
resultSet.latitude = coordinates.results[0].geometry.location.lat;
resultSet.longitude = coordinates.results[0].geometry.location.lng;
resultSet.localInfo = coordinates.results[0].address_components[0].long_name + ", " +
coordinates.results[0].address_components[1].long_name + ", " +
coordinates.results[0].address_components[2].long_name + ", " +
coordinates.results[0].address_components[3].long_name + ". ";
resolve(resultSet);
})
});
request.on('error', function(err) {
reject(err);
});
});
}
After that you could just do
googleRequest(90210).then(function(result) {
connectToForecastIO(result.latitude, result.longitude);
}
You can find out more about Promise's usage in the Promise API docs
You should also note that there are several libraries available that allow for promise based http requests such as fetch
I'm using PhantomJS to automate a page. What I do is:
do{
console.log(i);
i++;
page.open(url);
do { phantom.page.sendEvent('mousemove'); } while (page.loading);
if(page.injectJs('./Search.js') == false){
console.log("Search.js Failed")
}
var links = page.evaluate(function(json){
return search(json)
},json)
console.log(links);
} while(links == "")
So this leads me to opening the website repeated until what I'm looking for appears. But this also leads me to getting IP banned. What can I do to get around this?
Your IP is probably getting banned because the script generates too many requests to the website in very little time. So, you need to throttle requests, to apply a pause between them.
I would rewrite your script like this:
var page = require('webpage').create();
var url = "http://www.website.tld/";
var json = {"some" : "json"};
var i = 0;
var links;
// We abstract main code to a function so that we can call it
// again and again from itself
function getlinks (url, json) {
i++;
console.log(i);
page.open(url);
do { phantom.page.sendEvent('mousemove'); } while (page.loading);
if(page.injectJs('./Search.js') == false){
console.log("Search.js Failed")
}
var links = page.evaluate(function(json){
return search(json);
}, json);
if(links == "")
{
// No links scraped yet, so we wait for 3 seconds and try again
setTimeout(function(){
getlinks(url, json);
}, 3000)
}
else
{
console.log(links);
phantom.exit();
}
}
getlinks(url, json);
I'm beginner programmer. I found nice script
http://planzero.org/blog/2013/03/07/spidering_the_web_with_casperjs
I tried to rewrite this script with CasperJS test framework.
I would to get xunit report from this code
var startUrl = 'http://yoursite.foo';
var visitedUrls = [], pendingUrls = [];
var casper = require('casper').create({
pageSettings: {
loadImages: false,
loadPlugins: false
}});
var utils = require('utils')
var helpers = require('helpers')
// Spider from the given URL
casper.test.begin('href' , function(test) {
casper.start(startUrl, function() {
function spider(url) {
// Add the URL to the visited stack
visitedUrls.push(url);
// Open the URL
casper.open(url).then(function() {
test.assertHttpStatus(200, ":" + url);
// Find links present on this page
var links = this.evaluate(function() {
var links = [];
Array.prototype.forEach.call(__utils__.findAll('a'), function(e) {
links.push(e.getAttribute('href'));
});
return links;
});
// Add newly found URLs to the stack
var baseUrl = this.getGlobal('location').origin;
Array.prototype.forEach.call(links, function(link) {
var newUrl = helpers.absoluteUri(baseUrl, link);
if (pendingUrls.indexOf(newUrl) == -1 && visitedUrls.indexOf(newUrl) == -1 && !(link.search(startUrl) == -1)) {
pendingUrls.push(newUrl);
}
});
// If there are URLs to be processed
if (pendingUrls.length > 0) {
var nextUrl = pendingUrls.shift();
spider(nextUrl);
}
else {
console.log('links ended');
this.break;
}
});
}
spider(startUrl);
}).run(function(){
test.done();
});
});
Script is running but when he and Job I can't get report.
If you're trying to learn how to use CasperJS you need to start with a smaller example than that. That script is a mess which goes after a site named yoursite.foo (maybe you put that name in there?)
I would take small steps. I have a video which may help explain how to use CasperJS.
http://www.youtube.com/watch?v=Kefil5tCL9o
I'm testing out PhantomJS and trying to return all startups listed on angel.co. I decided to go with PhantomJS since I would need to paginate through the front page by clicking "Next" at the bottom. Right now this code does not return any results. I'm completely new to PhantomJS and have read through all the code examples so any guidance would be much appreciated.
var page = require('webpage').create();
page.open('https://angel.co/startups', function(status) {
if (status !== 'success') {
console.log('Unable to access network');
} else {
page.evaluate(function() {
var list = document.querySelectorAll('div.resume');
for (var i = 0; i < list.length; i++){
console.log((i + 1) + ":" + list[i].innerText);
}
});
}
phantom.exit();
});
By default, console messages evaluated on the page will not appear in your PhantomJS console.
When you execute code under page.evaluate(...), that code is being executed in the context of the page. So when you have console.log((i + 1) + ":" + list[i].innerText);, that is being logged in the headless browser itself, rather than in PhantomJS.
If you want all console messages to be passed along to PhantomJS itself, use the following after opening your page:
page.onConsoleMessage = function (msg) { console.log(msg); };
page.onConsoleMessage is triggered whenever you print to the console from within the page. With this callback, you're asking PhantomJS to echo the message to its own standard output stream.
For reference, your final code would look like (this prints successfully for me):
var page = require('webpage').create();
page.open('https://angel.co/startups', function(status) {
page.onConsoleMessage = function (msg) { console.log(msg); };
if (status !== 'success') {
console.log('Unable to access network');
} else {
page.evaluate(function() {
var list = document.querySelectorAll('div.resume');
for (var i = 0; i < list.length; i++){
console.log((i + 1) + ":" + list[i].innerText);
}
});
}
phantom.exit();
});
I want to go to select a file from sdcard and upload it to server. is it possible to access the sdcard in android via phonegap as how we are picking a image from gallery and uploading. I went through samples but all are specifying the file name also like eg: mnt/sdcard/read.txt. But i want to goto only sdcard so that user can select his own file is it possible to do.
U can easily do that its very easy
window.requestFileSystem(LocalFileSystem.PERSISTENT, 0, onFileSystemSuccessUpload, fail);
function onFileSystemSuccessUpload(fileSystem) {
// get directory entry through root and access all the folders
var directoryReader = fileSystem.root.createReader();
// Get a list of all the entries in the directory
directoryReader.readEntries(successReader,fail);
}
function successReader(entries) {
var i;
for (i=0; i<entries.length; i++) {
//alert(entries[i].name);
if(entries[i].isDirectory==true)
{
var directoryReaderIn = entries[i].createReader();
directoryReaderIn.readEntries(successReader,fail);
}
if(entries[i].isFile==true)
{
entries[i].file(uploadFile, fail);
}
}
};
function uploadFile(file) {
var target=""; //the url to upload on server
var ft = new FileTransfer(),path = "file://"+ file.fullPath,name = file.name;
ft.upload(path, target, win, fail, { fileName: name });
// var ft = new FileTransfer();
//ft.upload(file.fullPath, target, win, fail, options);
function win(r) {
alert("Code = " + r.responseCode);
alert("Response = " + r.response);
alert("Sent = " + r.bytesSent);
}
function fail(error) {
alert("An error has occurred: Code = " + error.code);
}
}