I can't render this page with splash - scrapy-splash

I have tried all the suggestions in splash documentation:
This is the link: 'https://app.microacquire.com/startup/az1Hi7Znn9PpVc7I6EsgLbgQv4A3/uckABGneMbqRx0BtI866'
This is the script (I even added the same headers as in a normal browser request):
function main(splash, args)
    splash.indexeddb_enabled = true
    splash.plugins_enabled = true
    splash.private_mode_enabled = false
    splash:set_custom_headers({
        ['Connection'] = 'keep-alive',
        ['authority'] = 'app.microacquire.com',
        ['accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
        ['accept-language'] = 'es-ES,es;q=0.9,en;q=0.8',
        ['cache-control'] = 'max-age=0',
        ['if-modified-since'] = 'Thu, 14 Jul 2022 16:35:49 GMT',
        ['if-none-match'] = '"1d1d916cd820c5eb54078f6fe3134fb2187ea0a14f9117866436828d1e447fd1-br"',
        ['sec-ch-ua'] = '".Not/A)Brand";v="99", "Google Chrome";v="103", "Chromium";v="103"',
        ['sec-ch-ua-mobile'] = '?0',
        ['sec-ch-ua-platform'] = 'Windows',
        ['sec-fetch-dest'] = 'document',
        ['sec-fetch-mode'] = 'navigate',
        ['sec-fetch-site'] = 'none',
        ['sec-fetch-user'] = '?1',
        ['upgrade-insecure-requests'] = '1',
        ['user-agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
    })
    assert(splash:go(args.url))
    assert(splash:wait(15))
    return {
        html = splash:html(),
        png = splash:png(),
        har = splash:har(),
    }
end
It renders only this output; it doesn't load completely:
[screenshot of the partially rendered page]
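With single-page apps like this one, a fixed wait can fire before the app finishes rendering. A possible next step (not from the original question) is to poll for a selector instead of waiting a fixed 15 seconds; a minimal Lua sketch, where the selector is a hypothetical placeholder:

-- Poll for up to 30 seconds until a target element exists,
-- instead of relying on one fixed wait.
function main(splash, args)
    assert(splash:go(args.url))
    for _ = 1, 30 do
        -- splash:select returns nil until the element appears
        if splash:select('div.startup-profile') then  -- placeholder selector
            break
        end
        splash:wait(1)
    end
    return { html = splash:html(), png = splash:png() }
end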

Related

BeautifulSoup: How to get only one part of <p> output?

I am trying to figure out how to extract only the price, without the surrounding text, from the paragraph.
from bs4 import BeautifulSoup
import requests
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"}
p = requests.get(url = 'https://www.tia-mobiteli.hr/detaljan-prikaz.aspx?gid=11-appise_64wheu', headers = headers)
soup = BeautifulSoup(p.content,'lxml')
price = soup.find('div', class_='widget widget-info widget-price').p.text
price2 = price.strip()
print(price2)
My output is:
Naša najniža cijena za gotovinsko/virmansko plaćanje: 3.649,00 kn
I want to get only:
3.649,00 kn
Or if it is possible:
3649.00
The price is inside <b> tag:
import requests
from bs4 import BeautifulSoup
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"
}
p = requests.get(
    url="https://www.tia-mobiteli.hr/detaljan-prikaz.aspx?gid=11-appise_64wheu",
    headers=headers,
)
soup = BeautifulSoup(p.content, "lxml")
price = soup.find("div", class_="widget widget-info widget-price").b.text
price = float(price.split()[0].replace(".", "").replace(",", "."))
print(price)
Prints:
3649.0
You can use the parse module, which acts like a reverse format().
Usage:
import parse
...
float(parse.parse('Naša najniža cijena za gotovinsko/virmansko plaćanje: {} kn',price2)[0].replace('.','').replace(',','.'))
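For reference, a self-contained version of that approach (a sketch assuming the same price string as above; parse is the third-party PyPI package, installed with pip install parse):

import parse

price2 = 'Naša najniža cijena za gotovinsko/virmansko plaćanje: 3.649,00 kn'
# parse() matches the literal template and captures the {} field ('3.649,00')
result = parse.parse('Naša najniža cijena za gotovinsko/virmansko plaćanje: {} kn', price2)
# Drop the thousands separator, swap the decimal comma, then convert
print(float(result[0].replace('.', '').replace(',', '.')))  # 3649.0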

How to change the header just for a specific request in scrapy spider?

I am trying to build a web crawler using Scrapy. I want to change the user agent for a single request in the spider. I tried the code below, but the user agent is not updated during the crawl.
def start_requests(self):
    request = Request(
        "url",
        callback=self.parse_search,
        meta={'xpaths': self.xpaths},
        headers={
            "User-Agent": "Googlebot-Image/1.0"
        }
    )
    return [request]
Your code works perfectly (see my code below), but some middleware on your side may be affecting your User-Agent header:

import scrapy

class UserAgentSpider(scrapy.Spider):
    name = 'useragent_spider'
    user_agents = [
        {'title': 'Galaxy S9', 'value': 'Mozilla/5.0 (Linux; Android 8.0.0; SM-G960F Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.84 Mobile Safari/537.36'},
        {'title': 'iPhone', 'value': 'Mozilla/5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/69.0.3497.105 Mobile/15E148 Safari/605.1'},
        {'title': 'Edge', 'value': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246'},
    ]

    def start_requests(self):
        for user_agent in self.user_agents:
            yield scrapy.Request(
                url="https://www.myip.com/",
                headers={
                    'user-agent': user_agent['value'],
                },
                cb_kwargs={
                    'user_agent': user_agent['title']
                },
                callback=self.parse,
                dont_filter=True,
            )

    def parse(self, response, user_agent):
        # The Samples/ directory must already exist
        with open(f"Samples/{user_agent}.htm", 'wb') as f:
            f.write(response.body)
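If the header is still being replaced, a user-agent middleware is the usual culprit. As a debugging sketch, you could disable any rotating user-agent middleware in settings.py (the module path below is hypothetical; use whatever entry appears in your own settings):

# settings.py
DOWNLOADER_MIDDLEWARES = {
    # Hypothetical third-party middleware; setting it to None disables it,
    # so your per-request 'User-Agent' header is no longer overwritten.
    'myproject.middlewares.RotateUserAgentMiddleware': None,
}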

Getting a 403 Forbidden when going to a URL with PhantomJS

If I go to the following web page in Chrome, it loads fine: https://www.cruisemapper.com/?poi=39
However, when I run the following PhantomJS script, which simply goes to the same URL and outputs the entire DOM string to the console, I get a 403 Forbidden message:
var page = require('webpage').create(),
    url = 'https://www.cruisemapper.com/?poi=39';

page.open(url, function (status) {
    if (status === 'success') {
        console.log(page.evaluate(function () {
            return document.documentElement.outerHTML;
        }));
        phantom.exit();
    }
});
Here's the exact output to the console:
<html><head>
<title>403 Forbidden</title>
</head><body>
<h1>Forbidden</h1>
<p>You don't have permission to access /
on this server.<br>
</p>
</body></html>
I thought that if I added some sort of user agent string, it might work. As such, I added the following above the console.log line:
page.settings.userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36';
But that didn't work. So then I tried the following instead:
page.customHeaders = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
};
But that didn't work either. Does anyone have any advice on how I can possibly hit up the URL above and not get a 403 Forbidden message? Thank you.
Your code works fine for me (I'd suggest viewport size emulation though; see the code). If you still get a 403, try changing your IP address; it's possible the site is on to you by now (you have probably visited that page many times).
var page = require('webpage').create(),
    url = 'https://www.cruisemapper.com/?poi=39';

page.settings.userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36';
page.viewportSize = { width: 1440, height: 900 }; // <-- otherwise it's 400x300 by default

// It's good to watch for errors on the page
page.onError = function (msg, trace) {
    console.log(msg);
    trace.forEach(function (item) {
        console.log('  ', item.file, ':', item.line);
    });
};

page.open(url, function (status) {
    console.log(status);
    page.render("page.png"); // Also useful to check if you get what you expect
    if (status === 'success') {
        console.log(page.evaluate(function () {
            return document.documentElement.outerHTML;
        }));
        phantom.exit();
    }
});

Mink + PhantomJS: How do I set the user agent?

I am using PhantomJS with Mink:
default:
  extensions:
    Behat\MinkExtension\Extension:
      goutte: ~
      selenium2:
        browser: phantomjs
        wd_host: http://localhost:8643/wd/hub
        capabilities:
          webStorageEnabled: true
But I need to masquerade as the latest Chrome. I have tried this:
/**
 * @BeforeStep
 */
public function masqueradeAsLatestChrome(StepEvent $event)
{
    $this->getSession()->setRequestHeader('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36');
}
But I get the exception:
[Behat\Mink\Exception\UnsupportedDriverActionException]
Exception has been thrown in "beforeStep" hook, defined in FeatureContext::masqueradeAsLatestChrome()
Request header is not supported by Behat\Mink\Driver\Selenium2Driver
The version of Chrome isn't critical, but the web application must think it's talking to a very recent version of Chrome.
Selenium does not provide this capability, as it is not something a
user can do. It's recommended you use a proxy to inject additional
headers to the requests generated by the browser.
https://code.google.com/p/selenium/issues/detail?id=2047#c1
Sadly… However, PhantomJS does provide an interface for setting the headers. Your best shot would be to send a direct command to it using its REST API. There's also a cool PHP wrapper library that would make it 200 times easier.
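As an illustration of the proxy route mentioned above, here is a minimal sketch using mitmproxy (one intercepting proxy among many; the file name is a placeholder). PhantomJS would then be started with --proxy=localhost:8080:

# inject_ua.py -- run with: mitmdump -s inject_ua.py
def request(flow):
    # Rewrite the User-Agent header on every outgoing request
    flow.request.headers["User-Agent"] = (
        "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36"
    )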
You should use the new Behat/Mink driver made by Juan Francisco Calderón Zumba
https://github.com/jcalderonzumba
Here is a direct link to the driver https://github.com/jcalderonzumba/MinkPhantomJSDriver
This driver allows you to specify the request headers that you need (it works with Behat 3.0, but I think it requires at least PHP 5.4).
You can pass additional settings to the selenium2 driver via extra_capabilities, so in your case:
default:
  extensions:
    Behat\MinkExtension\Extension:
      goutte: ~
      selenium2:
        browser: phantomjs
        wd_host: http://localhost:8643/wd/hub
        capabilities:
          webStorageEnabled: true
          extra_capabilities:
            phantomjs.page.settings.userAgent: "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36"
For https://github.com/facebook/php-webdriver you can use these capabilities:
$capabilities = [
    WebDriverCapabilityType::BROWSER_NAME => 'phantomjs',
    WebDriverCapabilityType::PLATFORM => 'ANY',
    WebDriverCapabilityType::ACCEPT_SSL_CERTS => false,
    WebDriverCapabilityType::JAVASCRIPT_ENABLED => true,
    'phantomjs.page.settings.userAgent' => "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246",
];

CasperJS fetchText() function echoing blank output

I'm trying to use fetchText() to print the URL of a Google search result to the terminal. Here's the image of what exactly I'm trying to print.
It only prints a blank string, though! I don't see what I'm doing wrong.
Code:
phantom.casperPath = "/usr/local/Cellar/casperjs/1.0.3/libexec/";
phantom.injectJs(phantom.casperPath + '/bootstrap.js');

var utils = require('utils');
var casper = require('casper').create();

casper.start('https://www.google.com/search?q=amazon+shoes');

casper.wait(3000, function () {
    this.echo(this.fetchText('#rso > div:nth-child(1) > li:nth-child(1) > div > div > div > div.f.kv._TD > cite'));
}).run();
Google changes the page depending on the user agent string, so you need to set one during creation (with an example string):
var casper = require("casper").create({
    pageSettings: {
        userAgent: "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36"
    }
});
or with the dedicated function:
casper.userAgent("Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36");
Sometimes it is also necessary to set the viewport to something desktop-like, because PhantomJS's default viewport is 400x300 and Google might render a different site based on it.
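For example, via the same creation options (a minimal sketch; combine with the user agent string as needed):

var casper = require("casper").create({
    viewportSize: { width: 1280, height: 800 }, // desktop-like viewport
    pageSettings: {
        userAgent: "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36"
    }
});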