I am new to Scrapy. I am getting None instead of the item. Here is my code:
class IndiaSpider(scrapy.Spider):
name = 'espace'
allowed_domains = ['worldwide.espacenet.com']
search_value = 'laptop'
start_urls = [f'https://worldwide.espacenet.com/patent/search?q={search_value}']
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
def request_header(self):
yield scrapy.Request(url=self.start_urls, callback=self.parse, headers={'User-Agent':self.user_agent})
def parse(self, response):
title = response.xpath("//span[@class='h2--2VrrSjFb item__content--title--dYTuyzV6']/text()").extract_first()
yield{
'title':title
}
I am getting
2023-01-17 15:58:53 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://worldwide.espacenet.com/patent/search?q=laptop> (referer: None)
2023-01-17 15:58:54 [scrapy.core.scraper] DEBUG: Scraped from <200 https://worldwide.espacenet.com/patent/search?q=laptop>
{'title': None}
2023-01-17 15:58:54 [scrapy.core.engine] INFO: Closing spider (finished)
Can anyone help me?
See the comments in the code below.
Basically, when data is loaded with JavaScript you'll want to get it from the API. If you open devtools in your browser you can see where the data is loaded from, recreate that request with Scrapy, and then parse the data from the JSON response.
Lose the request_header method; it's not one of the Spider's methods and you never call it (also note that scrapy.Request expects a single URL string, not the start_urls list). You probably wanted start_requests.
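For completeness, if you only wanted to send a custom User-Agent with the original page request, the override would look roughly like this (a minimal sketch; it still won't find the title, because the search results are rendered with JavaScript):
import scrapy

class IndiaSpider(scrapy.Spider):
    name = 'espace'
    allowed_domains = ['worldwide.espacenet.com']
    search_value = 'laptop'
    start_urls = [f'https://worldwide.espacenet.com/patent/search?q={search_value}']
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'

    def start_requests(self):
        # start_requests is called by Scrapy automatically; request_header never was
        for url in self.start_urls:
            yield scrapy.Request(url=url, callback=self.parse,
                                 headers={'User-Agent': self.user_agent})

    def parse(self, response):
        # still None for JS-rendered pages; kept only to show the structure
        yield {'title': response.xpath("//span[@class='h2--2VrrSjFb item__content--title--dYTuyzV6']/text()").get()}
The API-based approach below is what actually returns the titles: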
import json
import scrapy
class IndiaSpider(scrapy.Spider):
name = 'espace'
allowed_domains = ['worldwide.espacenet.com']
search_value = 'laptop'
# browser devtools -> network tab -> JSON url -> headers
headers = {
"Accept": "application/json,application/i18n+xml",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "en-US,en;q=0.5",
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"Content-Type": "application/json",
"DNT": "1",
"EPO-Trace-Id": "YOUR ID", # <------ copy it from your browser
"Host": "worldwide.espacenet.com",
"Origin": "https://worldwide.espacenet.com",
"Pragma": "no-cache",
"Referer": "https://worldwide.espacenet.com/patent/search?q=laptop",
"Sec-Fetch-Dest": "empty",
"Sec-Fetch-Mode": "cors",
"Sec-Fetch-Site": "same-origin",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
"X-EPO-PQL-Profile": "cpci"
}
api_url = f'https://worldwide.espacenet.com/3.2/rest-services/search?lang=en,de,fr&q={search_value}&qlang=cql&'
def start_requests(self):
# browser devtools -> network tab -> JSON url -> Request
payload = {
"filters": {
"publications.patent": [
{
"value": [
"true"
]
}
]
},
"query": {
"fields": [
"publications.ti_*",
"publications.abs_*",
"publications.pn_docdb",
"publications.in",
"publications.inc",
"publications.pa",
"publications.pac",
"publications.pd",
"publications.pr_docdb",
"publications.app_fdate.untouched",
"publications.ipc_ic",
"publications.ipc_icci",
"publications.ipc_iccn",
"publications.ipc_icai",
"publications.ipc_ican",
"publications.ci_cpci",
"publications.ca_cpci",
"publications.cl_cpci",
"biblio:pa;pa_orig;pa_unstd;in;in_orig;in_unstd;pac;inc;pd;pn_docdb;allKindCodes;",
"oprid_full.untouched",
"opubd_full.untouched"
],
"from": 0,
"highlighting": [
{
"field": "publications.ti_en",
"fragment_words_number": 20,
"hits_only": True,
"number_of_fragments": 3
},
{
"field": "publications.abs_en",
"fragment_words_number": 20,
"hits_only": True,
"number_of_fragments": 3
},
{
"field": "publications.ti_de",
"fragment_words_number": 20,
"hits_only": True,
"number_of_fragments": 3
},
{
"field": "publications.abs_de",
"fragment_words_number": 20,
"hits_only": True,
"number_of_fragments": 3
},
{
"field": "publications.ti_fr",
"fragment_words_number": 20,
"hits_only": True,
"number_of_fragments": 3
},
{
"field": "publications.abs_fr",
"fragment_words_number": 20,
"hits_only": True,
"number_of_fragments": 3
},
{
"field": "publications.pn_docdb",
"fragment_words_number": 20,
"hits_only": True,
"number_of_fragments": 3
},
{
"field": "publications.pa",
"fragment_words_number": 20,
"hits_only": True,
"number_of_fragments": 3
}
],
"size": 20
},
"widgets": {}
}
yield scrapy.Request(url=self.api_url, headers=self.headers, method='POST', body=json.dumps(payload))
def parse(self, response):
# browser devtools -> network tab -> JSON url -> Response
json_data = response.json()
if json_data:
for hit in json_data['hits']:
if 'publications.ti_en' in hit['hits'][0]['fields']:
title = hit['hits'][0]['fields']['publications.ti_en']
yield {'title': title}
Output:
{'title': ['METHOD AND DEVICE FOR CHECKING THE DETERMINATION OF THE POSITION OF A MOBILE STATION CARRIED OUT BY A RADIO COMMUNICATION SYSTEM']}
{'title': ['Laptop']}
{'title': ['PRESENTATION LAPTOP']}
{'title': ['LAPTOP COMPUTER']}
{'title': ['Laptop comprises an integrated flat bed scanner containing a composite glass plate made from a mineral glass pane and a plastic layer']}
...
...
...
Related
I'm learning to build a scraper that scrapes search results but first needs to log in. I read the documentation and this article here. Unfortunately, I'm still stuck. My spider reports the following: <403 https://github.com/login>: HTTP status code is not handled or not allowed.
class GitHubSpider(CrawlSpider):
name = "github"
start_urls = [
"https://github.com/search?p=1&q=React+Django&type=Users",
]
rules = (
Rule(
LinkExtractor(restrict_css="a.mr-1"),
callback="parse_engineer",
),
Rule(LinkExtractor(restrict_css=".next_page")),
)
def start_requests(self):
return [
scrapy.FormRequest(
url="https://github.com/login",
formdata={
"login": "scrapy",
"password": "12345",
},
callback=self.parse,
)
]
def parse_engineer(self, response):
yield {
"username": response.css(".vcard-username::text").get().strip(),
}
Edit: answering @SuperUser's suggestion.
headers = {
[...]
}
def start_requests(self):
# Do I have access to response here?
token = response.xpath('//form/input[@name="authenticity_token"]/@value').get()
return [
scrapy.FormRequest(
url="https://github.com/login",
formdata={
"login": "scrapy",
"password": "12345",
"authenticity_token": token, # <-------------
},
headers=self.headers,
callback=self.parse,
)
]
Go to settings.py and set ROBOTSTXT_OBEY = False (see the sketch after this list).
Replace the default user agent with another one.
Add the request headers from the requested page; you can get them with your browser's devtools.
Just know that they can block your IP, and also block your account.
I suggest using PyGithub instead.
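A rough sketch of those settings changes (the user agent string and headers here are just examples):
# settings.py
ROBOTSTXT_OBEY = False

# replace Scrapy's default user agent
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"

# optionally send these headers with every request
DEFAULT_REQUEST_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
}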
Edit:
The request headers:
class GitHubSpider(CrawlSpider):
name = "github"
start_urls = [
"https://github.com/search?p=1&q=React+Django&type=Users",
]
rules = (
Rule(
LinkExtractor(restrict_css="a.mr-1"),
callback="parse_engineer",
),
Rule(LinkExtractor(restrict_css=".next_page")),
)
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "en-US,en;q=0.5",
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"DNT": "1",
"Host": "github.com",
"Pragma": "no-cache",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "none",
"Sec-Fetch-User": "?1",
"Sec-GPC": "1",
"TE": "trailers",
"Upgrade-Insecure-Requests": "1",
"USER_AGENT": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36",
}
def start_requests(self):
return [
scrapy.FormRequest(
url="https://github.com/login",
formdata={
"login": "scrapy",
"password": "12345",
},
headers=self.headers,
callback=self.parse,
)
]
def parse_engineer(self, response):
yield {
"username": response.css(".vcard-username::text").get().strip(),
}
Also note that you need to get the CSRF token:
token = response.xpath('//form/input[@name="authenticity_token"]/@value').get()
Pass the token with the username and password.
formdata={
"login":...,
"password":...,
"authenticity_token": token,
}
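Since there is no response object inside start_requests, one way to get the token is to request the login page first and submit the form from its callback. A rough sketch of the methods inside the spider class (the callback names are illustrative; FormRequest.from_response also copies hidden inputs such as authenticity_token for you):
def start_requests(self):
    # no response exists yet here, so fetch the login page first
    yield scrapy.Request("https://github.com/login", callback=self.login)

def login(self, response):
    # from_response picks up the form action and hidden fields from the page
    yield scrapy.FormRequest.from_response(
        response,
        formdata={"login": "scrapy", "password": "12345"},
        callback=self.after_login,
    )

def after_login(self, response):
    # continue with the search pages / crawl rules from here
    ...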
I am currently using KrakenD (https://krakend.io) API Gateway to proxy requests to my backend service. One of my backend service APIs responds with an HTTP 303 redirect. The redirect response looks like this:
HTTP/1.1 303 See Other
content-length: 48
content-type: text/plain; charset=utf-8
date: Thu, 16 Jul 2020 10:25:41 GMT
location: https://www.detik.com/
vary: Accept
x-powered-by: Express
x-envoy-upstream-service-time: 17
server: istio-envoy
The problem is that, instead of returning the HTTP 303 response to the client as-is (with the Location response header), KrakenD is actually following the redirect and returning the response of the redirect URL, which is the HTML of https://www.detik.com/.
My current KrakenD configuration looks like this:
{
"version": 2,
"extra_config": {
"github_com/devopsfaith/krakend-cors": {
"allow_origins": [],
"expose_headers": [
"Content-Length",
"Content-Type",
"Location"
],
"allow_headers": [
"Content-Type",
"Origin",
"X-Requested-With",
"Accept",
"Authorization",
"secret",
"Host"
],
"max_age": "12h",
"allow_methods": [
"GET",
"POST",
"PUT"
]
},
"github_com/devopsfaith/krakend-gologging": {
"level": "ERROR",
"prefix": "[GATEWAY]",
"syslog": false,
"stdout": true,
"format": "default"
},
"github_com/devopsfaith/krakend-logstash": {
"enabled": false
}
},
"timeout": "10000ms",
"cache_ttl": "300s",
"output_encoding": "json",
"name": "api-gateway",
"port": 8080,
"endpoints": [
{
"endpoint": "/ramatestredirect",
"method": "GET",
"extra_config": {},
"output_encoding": "no-op",
"concurrent_calls": 1,
"backend": [
{
"url_pattern": "/",
"encoding": "no-op",
"sd": "static",
"extra_config": {},
"method": "GET",
"host": [
"http://ramatestredirect.default.svc.cluster.local"
],
"disable_host_sanitize": false
}
]
}
]
}
So how can I make KrakenD return the original HTTP 303 response from my backend service to the client unaltered?
Thank you.
I assume that you're calling the /ramatestredirect endpoint.
To get the backend HTTP status code (as you said, it returns a 303), you can do it this way:
{
"endpoint": "/ramatestredirect",
"method": "GET",
"extra_config": {},
"output_encoding": "no-op",
"concurrent_calls": 1,
"backend": [
{
"url_pattern": "/",
"encoding": "no-op",
"sd": "static",
"extra_config": {
"github.com/devopsfaith/krakend/http": {
"return_error_details": "authentication"
}
},
"method": "GET",
"host": [
"http://ramatestredirect.default.svc.cluster.local"
],
"disable_host_sanitize": false
}
]
}
So, basically, with this plugin you can get the original backend HTTP status code:
"github.com/devopsfaith/krakend/http": {
"return_error_details": "authentication"
}
If you use the Lura framework (formerly known as the KrakenD framework), then you may have to disable redirects for your HTTP client.
client := &http.Client{
    // Returning http.ErrUseLastResponse tells the client not to follow
    // redirects, so the 303 and its Location header are passed through.
    CheckRedirect: func(req *http.Request, via []*http.Request) error {
        return http.ErrUseLastResponse
    },
}
I use Expo, and I want to give access to my app to users who have a Google account. Then I need to get info about the Google calendars of the user who logs into my app.
I implemented the login function using Expo.Google.logInAsync(options) (https://docs.expo.io/versions/latest/sdk/google). My scopes look like this:
scopes: ['https://www.googleapis.com/auth/userinfo.email',
'https://www.googleapis.com/auth/userinfo.profile',
'https://www.googleapis.com/auth/calendar.readonly']
When someone tries to log into my app, it asks for permission to see the calendar list. In response I get:
Object {
"accessToken": "a",
"idToken": "b",
"refreshToken": "c",
"serverAuthCode": "d",
"type": "success",
"user": Object {
"email": "e",
"familyName": "f",
"givenName": "g",
"id": "h",
"name": "i",
"photoUrl": "j",
},
}
I received data about the user, but I don't have any data about their calendars.
I tried to get data about the calendars (https://developers.google.com/calendar/v3/reference/calendarList) with this function:
getUsersCalendarList = async (accessToken) => {
let calendarsList = await fetch('https://www.googleapis.com/calendar/v3/users/me/calendarList', {
headers: { Authorization: `Bearer ${accessToken}`},
});
return calendarsList;
}
In response I got:
Response {
"_bodyBlob": Blob {
"_data": Object {
"blobId": "67b8b161-690f-4ff6-9cee-1dce12840ebd",
"offset": 0,
"size": 994,
},
},
"_bodyInit": Blob {
"_data": Object {
"blobId": "67b8b161-690f-4ff6-9cee-1dce12840ebd",
"offset": 0,
"size": 994,
},
},
"headers": Headers {
"map": Object {
"alt-svc": Array [
"quic=\":443\"; ma=2592000; v=\"44,43,39,35\"",
],
"cache-control": Array [
"public, max-age=0",
],
"content-type": Array [
"application/json; charset=UTF-8",
],
"date": Array [
"Thu, 17 Jan 2019 11:30:32 GMT",
],
"expires": Array [
"Thu, 17 Jan 2019 11:30:32 GMT",
],
"server": Array [
"GSE",
],
"vary": Array [
"X-Origin",
],
"x-content-type-options": Array [
"nosniff",
],
"x-frame-options": Array [
"SAMEORIGIN",
],
"x-xss-protection": Array [
"1; mode=block",
],
},
},
"ok": false,
"status": 403,
"statusText": undefined,
"type": "default",
"url": "https://www.googleapis.com/calendar/v3/users/me/calendarList",
}
How can I get a list of the user's Google calendars in Expo?
I found the solution here: React-Native JSON fetch from URL. One needs to call the json() function on the returned object.
The getUsersCalendarList should look like this:
getUsersCalendarList = async (accessToken) => {
let calendarsList = await fetch('https://www.googleapis.com/calendar/v3/users/me/calendarList', {
headers: { Authorization: `Bearer ${accessToken}`},
});
return calendarsList.json();
}
You can also add the access token as a parameter on the request.
https://www.googleapis.com/calendar/v3/users/me/calendarList?access_token={token}
I am not a React dev, so I'm not exactly sure how to fix your header. It looks OK.
I have used Nightwatch.js to automate e2e test cases for my product. It worked very well on Chrome, Firefox, and other UI-based browsers. However, I need to run it on PhantomJS as a headless browser so it can run as part of Jenkins automation.
I tried, but the test script is not working with PhantomJS.
Test Script:
describe('TEST PHANTOMJS#',function() {
afterEach((client,done) => {
client.end(() => done());
});
it('successful test google.com',(client)=> {
// Launch google.com
client.url('https://www.google.com').resizeWindow(1000,800);
console.log('Launched Google')
client.expect.element('body1').to.be.present.before(1000); // test error
console.log('Completed testing')
});
});
My nightwatch.json configuration:
{
"src_folders": [
"tests"
],
"output_folder": "reports",
"custom_commands_path": "",
"custom_assertions_path": "",
"page_objects_path": "",
"selenium": {
"start_process": true,
"server_path": "./bin/selenium/selenium-server-standalone-3.0.1.jar",
"log_path": "",
"port": 4444,
"cli_args": {
"webdriver.chrome.driver": "./bin/chrome/chromedriver",
"webdriver.gecko.driver": "./bin/firefox/geckodriver",
"webdriver.edge.driver": "./bin/ie/IEDriverServer.exe"
}
},
"test_settings": {
"default": {
"selenium_port": 4444,
"selenium_host": "localhost",
"default_path_prefix": "/wd/hub",
"silent": true,
"screenshots": {
"enabled": true,
"on_failure": true,
"path": "./screen-shots"
},
"desiredCapabilities": {
"browserName": "phantomjs",
"javascriptEnabled": true,
"acceptSslCerts": true,
"phantomjs.binary.path": "./node_modules/phantomjs-prebuilt/bin/phantomjs",
"phantomjs.page.settings.userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36",
"phantomjs.cli.args": []
},
"test_runner": {
"type": "mocha",
"options": {
"ui": "bdd",
"reporter": "list"
}
}
}
}
}
After running ./node_modules/.bin/nightwatch --env qa --verbose I see the following log:
> nightwatch --env qa --verbose
Starting selenium server... started - PID: 11037
TEST PHANTOMJS# successful test google.com: Launched Google
Completed testing
INFO Request: POST /wd/hub/session
- data: {"desiredCapabilities":{"browserName":"phantomjs","javascriptEnabled":true,"acceptSslCerts":true,"platform":"ANY","phantomjs.binary.path":"./node_modules/phantomjs-prebuilt/bin/phantomjs","phantomjs.page.settings.userAgent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36","phantomjs.cli.args":[]}}
- headers: {"Content-Type":"application/json; charset=utf-8","Content-Length":372}
INFO Response 200 POST /wd/hub/session (1409ms) { state: null,
sessionId: 'd16c7439-18ec-4b67-85eb-e3dda6fe0075',
hCode: 1253002783,
value:
{ applicationCacheEnabled: false,
rotatable: false,
handlesAlerts: false,
databaseEnabled: false,
version: '2.1.1',
platform: 'MAC',
browserConnectionEnabled: false,
proxy: { proxyType: 'direct' },
nativeEvents: true,
acceptSslCerts: false,
driverVersion: '1.2.0',
'webdriver.remote.sessionid': 'd16c7439-18ec-4b67-85eb-e3dda6fe0075',
'phantomjs.page.settings.userAgent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36',
locationContextEnabled: false,
webStorageEnabled: false,
browserName: 'phantomjs',
takesScreenshot: true,
driverName: 'ghostdriver',
javascriptEnabled: true,
cssSelectorsEnabled: true },
class: 'org.openqa.selenium.remote.Response',
status: 0 }
INFO Got sessionId from selenium d16c7439-18ec-4b67-85eb-e3dda6fe0075
INFO Request: POST /wd/hub/session/d16c7439-18ec-4b67-85eb-e3dda6fe0075/url
- data: {"url":"https://www.google.com"}
- headers: {"Content-Type":"application/json; charset=utf-8","Content-Length":32}
Ideally, it should complete the test run and report an error. However, it gets stuck and doesn't move further.
Any help would be appreciated.
When you use PhantomJS on a website with HTTPS, you generally have to run your script with the ignore-ssl-errors option. Otherwise, you will often get in trouble... If your script works with all graphical browsers, but not with PhantomJS, your issue is most likely related to SSL/TLS.
In nightwatch.json, where you configure PhantomJS, make sure to add the CLI option:
"phantomjs.cli.args": ["--ignore-ssl-errors=true"]
The following script does not work without the option (it does not print the page title), but it works when you add it:
module.exports = {
'PhantomJS': function(browser) {
browser
.url('https://www.google.com')
.waitForElementVisible('body', 1000)
.getTitle(function (title) {
console.log(title);
})
.end();
}
};
I am getting status "ZERO_RESULTS" when I invoke the HTTP adapter using Indian latitude and longitude to search for any service like food, malls, etc.
Here is my adapter implementation:
function getGooglePlaces(location,name) {
var input = {
method : 'get',
returnedContentType : 'json',
path : 'maps/api/place/search/json',
headers: {
Host: 'maps.googleapis.com'
},
parameters : {
'key' : My Key,
'location' : location,
'radius' : '10000',
'sensor' : 'false',
'name' : name
}
};
var response = WL.Server.invokeHttp(input);
return response;
}
The JSON response for Indian coordinates (results is the array of places searched):
{
"html_attributions": [
],
"isSuccessful": true,
"responseHeaders": {
"Alternate-Protocol": "443:quic",
"Cache-Control": "public, max-age=300",
"Content-Type": "application\/json; charset=UTF-8",
"Date": "Wed, 12 Feb 2014 15:06:33 GMT",
"Expires": "Wed, 12 Feb 2014 15:11:33 GMT",
"Server": "mafe",
"Transfer-Encoding": "chunked",
"Vary": "Accept-Language",
"X-Frame-Options": "SAMEORIGIN",
"X-XSS-Protection": "1; mode=block"
},
"responseTime": 609,
"results": [
],
"status": "ZERO_RESULTS",
"statusCode": 200,
"statusReason": "OK",
"totalTime": 609
}
I have used the following and got a valid response, using a Worklight adapter.
Note that I passed name and location as part of the parameters object and not like you did (just for the sake of simplification)
I added a type parameter for "restaurant"
I increased the radius
With the above, I got results.
Looks to me like this is a matter of tweaking the request parameters. Nothing related to Worklight adapters here, as the adapter works. See also: https://www.google.com/search?q=google+places+returns+zero+results
GooglePlaces-impl.js
function getPlaces() {
var input = {
method : 'get',
returnedContentType : 'json',
path : 'maps/api/place/search/json',
headers: {
Host: 'maps.googleapis.com'
},
parameters : {
'key' : 'make-sure-to-place-here-your-SERVER-KEY-generated-by-Google-GCM-console',
'location' : '27.173033, 78.042133',
'radius' : '30000',
'sensor' : 'false',
'name' : 'Taj Mahal',
'type' : 'restaurant',
}
};
var response = WL.Server.invokeHttp(input);
return response;
}
Procedure response
{
"html_attributions": [
],
"isSuccessful": true,
"responseHeaders": {
"Alternate-Protocol": "443:quic",
"Cache-Control": "public, max-age=300",
"Content-Type": "application\/json; charset=UTF-8",
"Date": "Fri, 14 Mar 2014 05:45:41 GMT",
"Expires": "Fri, 14 Mar 2014 05:50:41 GMT",
"Server": "mafe",
"Transfer-Encoding": "chunked",
"Vary": "Accept-Language",
"X-Frame-Options": "SAMEORIGIN",
"X-XSS-Protection": "1; mode=block"
},
"responseTime": 662,
"results": [
{
"geometry": {
"location": {
"lat": 27.175015,
"lng": 78.042155
}
},
"icon": "http:\/\/maps.gstatic.com\/mapfiles\/place_api\/icons\/generic_business-71.png",
"id": "dcc9586b99ab0f0471fccecbe2dbb40fdc1fc2b5",
"name": "Taj Mahal",
"photos": [
{
"height": 853,
"html_attributions": [
],
"photo_reference": "CnRnAAAAojrNNKpH2yqjmiph_kNBxiT5DaK_g3N05YtE_mPP5tBrYD8XjyAAz_xEvWAJymfGOeeOTlVgzFUxUeOMQspvoPGogDQyWomDeZsNP7XEW3JsmzmYakDk_vyJBwajiamhluypx6rCDqDCnBWb6JnCLBIQRuBibftSN9xruu2eeZEm5xoUksG5kyFbtZULWpZceNvNhyl72tQ",
"width": 1280
}
],
"rating": 4.6,
"reference": "CnRrAAAA8n6I_Dnlm9UzwTiaTntjcxR-FysL5Ya26Fdcsdb48XOIxiJDGdd3AiK6iUUti41d1BQ1XnBfZoVMKWZ5QOyVZAW8QyH-xqSY8eaQXuxH0atjzXtuaplht-ww76JtbxQLkJ4SUtFrmrs7ZjmZn-_RhBIQmYTB0_yGd_4hm2bHoIKt5xoULaBq-FsZo51jFdxLV377nHM0cCI",
"types": [
"establishment"
],
"vicinity": "Agra"
},
...
...
"status": "OK",
"statusCode": 200,
"statusReason": "OK",
"totalTime": 1088
}