I am trying to download files without starting a Scrapy project, just from a single .py file. I created a custom pipeline inside that Python file, and the error mentioned below appears.

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.pipelines.files import FilesPipeline
from urllib.parse import urlparse
import os


class DatasetItem(scrapy.Item):
    file_urls = scrapy.Field()
    files = scrapy.Field()


class MyFilesPipeline(FilesPipeline):
    pass


class DatasetSpider(scrapy.Spider):
    name = 'Dataset_Scraper'
    url = 'https://kern.humdrum.org/cgi-bin/browse?l=essen/europa/deutschl/allerkbd'
    headers = {
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }
    custom_settings = {
        'FILES_STORE': 'Dataset',
        'ITEM_PIPELINES': {"/home/LaxmanMaharjan/dataset/MyFilesPipeline": 1}
    }

    def start_requests(self):
        yield scrapy.Request(
            url=self.url,
            headers=self.headers,
            callback=self.parse
        )

    def parse(self, response):
        item = DatasetItem()
        links = response.xpath('.//body/center[3]/center/table/tr[1]/td/table/tr/td/a[4]/@href').getall()
        for link in links:
            item['file_urls'] = [link]
            yield item
            break


if __name__ == "__main__":
    # run spider from script
    process = CrawlerProcess()
    process.crawl(DatasetSpider)
    process.start()
Error: Error loading object home-LaxmanMaharjan-dataset-Pipeline': not a full path
The path is correct.
How do I use a custom files pipeline within this single Python file?
I am trying to add a custom files pipeline so the files are downloaded with proper names. I cannot just give the pipeline class name, because ITEM_PIPELINES expects an import path, and when I enter the filesystem path the error above appears.

If the pipeline code, the spider code, and the process launcher are all stored in the same file, you can use __main__ in the path to enable the pipeline:
custom_settings = {
    'FILES_STORE': 'Dataset',
    'ITEM_PIPELINES': {"__main__.MyFilesPipeline": 1}
}
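Since the underlying goal is to download files with proper names, a further option is to override file_path() in the custom pipeline so the last URL segment is used instead of the default hashed name. This is only a sketch of that idea, not code from the original question:

import os
from urllib.parse import urlparse
from scrapy.pipelines.files import FilesPipeline

class MyFilesPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None, *, item=None):
        # store each file under the last segment of its URL
        # rather than the default SHA1-hashed name
        return os.path.basename(urlparse(request.url).path)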

Related

Passing google login cookies from scrapy splash to selenium

I want to sign in to my Google Account, enable a Google API, and extract the developer's key. My main task is to automate this process.
It is well known that you can't log into a Google Account with an automated browser, but I did manage to do that using scrapy-splash.
import re
import time
import base64
import scrapy
from scrapy_splash import SplashRequest
from selenium import webdriver


class GoogleScraperSpider(scrapy.Spider):
    name = 'google_scraper'

    script = """
    function main(splash)
        splash:init_cookies(splash.args.cookies)
        local url = splash.args.url
        local youtube_url = "https://console.cloud.google.com/apis/library/youtube.googleapis.com"
        assert(splash:go(url))
        assert(splash:wait(1))
        splash:set_viewport_full()
        local search_input = splash:select('.whsOnd.zHQkBf')
        search_input:send_text("xxxxxxxxxxx@gmail.com")
        assert(splash:wait(1))
        splash:runjs("document.getElementById('identifierNext').click()")
        splash:wait(5)
        local search_input = splash:select('.whsOnd.zHQkBf')
        search_input:send_text("xxxxxxxx")
        assert(splash:wait(1))
        splash:runjs("document.getElementById('passwordNext').click()")
        splash:wait(5)
        return {
            cookies = splash:get_cookies(),
            html = splash:html(),
            png = splash:png()
        }
    end
    """

    def start_requests(self):
        url = 'https://accounts.google.com'
        yield SplashRequest(url, self.parse, endpoint='execute', session_id="1", args={'lua_source': self.script})

    def parse(self, response):
        imgdata = base64.b64decode(response.data['png'])
        with open('image.png', 'wb') as file:
            file.write(imgdata)
        cookies = response.data.get("cookies")
        driver = webdriver.Chrome("./chromedriver")
        for cookie in cookies:
            if "." in cookie["domain"][:1]:
                url = f"https://www{cookie['domain']}"
            else:
                url = f"https://{cookie['domain']}"
            driver.get(url)
            driver.add_cookie(cookie)
        driver.get("https://console.cloud.google.com/apis/library/youtube.googleapis.com")
        time.sleep(5)
In the parse function I am trying to retrieve those cookies and add them to my chromedriver to bypass the login process, so I can move ahead with enabling the API and extracting the key, but I always end up at the login page in the chromedriver.
Your help would be most appreciated.
Thanks.
Try using pickle to save the cookies instead. Just use any Python console to save the cookies with this code:

import pickle

input('press enter when logged in')
pickle.dump(driver.get_cookies(), open('cookies.pkl', 'wb'))

Then you get a cookies.pkl file with the Google login data. Import it in your code using:

import pickle

cookies = pickle.load(open('cookies.pkl', 'rb'))
for cookie in cookies:
    driver.add_cookie(cookie)
driver.refresh()
# rest of the work here...

Refresh the driver to apply the cookies.
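One detail worth noting, added here as an assumption on top of the answer above: Selenium only accepts add_cookie() for the domain the browser is currently on, so a minimal end-to-end sketch of restoring the saved cookies could look like this:

import pickle
from selenium import webdriver

driver = webdriver.Chrome()

# visit the site once so the cookies' domain matches the current page
driver.get("https://accounts.google.com")

with open('cookies.pkl', 'rb') as f:
    cookies = pickle.load(f)

for cookie in cookies:
    driver.add_cookie(cookie)

# reload so the restored cookies take effect
driver.refresh()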

Fetch html and image from URL in python: HTTP Error 403 or Cloudflare-protected page with captcha

I want to get the HTML code from a URL with a Python script. When I use the 'urllib' library it works for many sites, but for my specific URL I get 'HTTP Error 403: Forbidden'. Here is an example I am having problems with:
from urllib.request import Request, urlopen, urlretrieve
url='https://bama.ir/car/detail-szx9y9u-hyundai-sonata-hybrid-gls-plus-2017'
header = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.11',
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
          'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
          'Accept-Encoding': 'none',
          'Accept-Language': 'en-US,en;q=0.8',
          'Connection': 'keep-alive'}
req = Request(url, headers=header)
web_byte = urlopen(req).read()
webpage = web_byte.decode('utf-8')
print(webpage)
Downloading an image directly with a Python script from this specific URL gives the same error. An example of that:
link="https://cdn.bama.ir/uploads/BamaImages/VehicleCarImages/6d364e82-f39d-419a-b346-257014928907/CarImage9557638_2_thumb_900_600.jpeg"
name=link.rsplit("/",1)[1]
urlretrieve(link,name)
When I use the 'BeautifulSoup', 'cloudscraper', or 'urllib3' libraries, a Cloudflare-protected page with a captcha is received. Here is an example of that:
url="https://bama.ir/car/detail-grxi644n-hyundai-genesis-coupe-2011"
from bs4 import BeautifulSoup as bs
import requests
soup = bs(requests.get(url).content, "html.parser")
print(soup.text)
import cloudscraper
scraper = cloudscraper.create_scraper()
print(scraper.get(url).text)
import urllib3
http = urllib3.PoolManager()
r = http.request("GET",url)
print(r.data)
When I use the 'selenium' library it sometimes works, but sometimes the Cloudflare-protected page with a captcha appears. For downloading the image I just have to use the screenshot function. Here is an example of that:
from selenium import webdriver
url="https://cdn.bama.ir/uploads/BamaImages/VehicleCarImages/a36114cd-1978-41a4-a558-cbe5f652faf1/CarImage9473771_0_1_thumb_900_600.jpg"
driver = webdriver.Chrome('chromedriver.exe')
driver.get(url)
html = driver.page_source
driver.save_screenshot("screenshot.png")
driver.close()
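As a hedged sketch of one possible workaround (not from the original post): once Selenium has gotten past the Cloudflare check, its cookies and user agent can be copied into a requests session to download the image bytes directly instead of taking a screenshot:

import requests
from selenium import webdriver

img_url = "https://cdn.bama.ir/uploads/BamaImages/VehicleCarImages/a36114cd-1978-41a4-a558-cbe5f652faf1/CarImage9473771_0_1_thumb_900_600.jpg"

driver = webdriver.Chrome('chromedriver.exe')
driver.get(img_url)  # let Selenium clear the Cloudflare check first

# copy the browser's cookies and user agent into a requests session
session = requests.Session()
for cookie in driver.get_cookies():
    session.cookies.set(cookie['name'], cookie['value'], domain=cookie.get('domain'))
session.headers['User-Agent'] = driver.execute_script("return navigator.userAgent;")

resp = session.get(img_url)
if resp.ok:
    with open(img_url.rsplit("/", 1)[1], 'wb') as f:
        f.write(resp.content)

driver.quit()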

Is there a way to run code after reactor.run() in scrapy?

I am working on a Scrapy API. One of my issues was that the Twisted reactor wasn't restartable; I fixed this by using CrawlerRunner instead of CrawlerProcess. My spider extracts links from a website and validates them. My issue is that if I add the validation code after reactor.run(), it doesn't get executed. This is my code:
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from twisted.internet import reactor
from urllib.parse import urlparse
import urllib.parse  # needed for urllib.parse.quote below

list = set([])
list_validate = set([])

runner = CrawlerRunner()


class Crawler(CrawlSpider):
    name = "Crawler"
    start_urls = ['https://www.example.com']
    allowed_domains = ['www.example.com']
    rules = [Rule(LinkExtractor(), callback='parse_links', follow=True)]
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})

    def parse_links(self, response):
        base_url = url
        href = response.xpath('//a/@href').getall()
        list.add(urllib.parse.quote(response.url, safe=':/'))
        for link in href:
            if base_url not in link:
                list.add(urllib.parse.quote(response.urljoin(link), safe=':/'))
        for link in list:
            if base_url in link:
                list_validate.add(link)


runner.crawl(Crawler)
reactor.run()
If I add the code that validates the links after reactor.run(), it doesn't get executed. And if I put the code before reactor.run(), nothing happens because the spider hasn't finished crawling all the links yet. What should I do? The code that validates the links is perfectly fine; I have used it before and it works.
We can do that with d.addCallback(<callback_function>) and d.addErrback(<errback_function>):
...
runner = CrawlerRunner()
d = runner.crawl(MySpider)

def finished(d):
    print("finished :D")

def spider_error(err):
    print("Spider error :/")

d.addCallback(finished)
d.addErrback(spider_error)

reactor.run()
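To actually reach code placed after reactor.run(), the reactor also has to be stopped once the crawl's Deferred has fired; a minimal sketch of that pattern (the validation body is just a placeholder) could look like this:

from scrapy.crawler import CrawlerRunner
from twisted.internet import reactor

runner = CrawlerRunner()
d = runner.crawl(Crawler)

def validate(_):
    # placeholder for the link-validation logic;
    # this only runs after the spider has finished crawling
    for link in list_validate:
        print("validating", link)

d.addCallback(validate)
d.addBoth(lambda _: reactor.stop())  # lets reactor.run() return

reactor.run()
# anything here runs after the crawl and the validation have completed
print("done")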
For your scraper API you can use Klein.
Klein is a micro-framework for developing production-ready web services with Python. It is 'micro' in that it has an incredibly small API similar to Bottle and Flask.
...
import scrapy
from scrapy.crawler import CrawlerRunner
from klein import Klein

app = Klein()


@app.route('/')
async def hello(request):
    status = list()

    class TestSpider(scrapy.Spider):
        name = 'test'
        start_urls = [
            'https://quotes.toscrape.com/',
            'https://quotes.toscrape.com/page/2/',
            'https://quotes.toscrape.com/page/3/',
            'https://quotes.toscrape.com/page/4/'
        ]

        def parse(self, response):
            """
            parse
            """
            status.append(response.status)

    runner = CrawlerRunner()
    d = await runner.crawl(TestSpider)
    content = str(status)
    return content


@app.route('/h')
def index(request):
    return 'Index Page'


app.run('localhost', 8080)
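Assuming the Klein service above is running, each request to the root route starts a fresh crawl and returns the collected statuses; for example, from another Python shell:

from urllib.request import urlopen

# hitting the root route triggers the crawl and returns str(status),
# typically something like "[200, 200, 200, 200]"
print(urlopen('http://localhost:8080/').read().decode())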

How to add Proxy to Scrapy and Selenium Script

I would like to add a proxy to my script.
How do I do that? Do I have to configure it in Selenium or in Scrapy?
I think that Scrapy is making the initial request, so it would make sense to set the proxy there. But what exactly do I have to do?
Can you recommend any proxy list that works reliably?
This is my current script:
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium import webdriver
import re
import csv
from time import sleep


class PostsSpider(Spider):
    name = 'posts'
    allowed_domains = ['xyz']
    start_urls = ('xyz',)

    def parse(self, response):
        with open("urls.txt", "rt") as f:
            start_urls = [url.strip() for url in f.readlines()]
        for url in start_urls:
            self.driver = webdriver.Chrome(r'C:\webdrivers\chromedriver.exe')
            self.driver.get(url)
            try:
                self.driver.find_element_by_id('currentTab').click()
                sleep(3)
                self.logger.info('Sleeping for 5 sec.')
                self.driver.find_element_by_xpath('//*[@id="_blog-menu"]/div[2]/div/div[2]/a[3]').click()
                sleep(7)
                self.logger.info('Sleeping for 7 sec.')
            except NoSuchElementException:
                self.logger.info('Blog does not exist anymore')
            while True:
                try:
                    element = self.driver.find_element_by_id('last_item')
                    self.driver.execute_script("arguments[0].scrollIntoView(0, document.documentElement.scrollHeight-5);", element)
                    sleep(3)
                    self.driver.find_element_by_id('last_item').click()
                    sleep(7)
                except NoSuchElementException:
                    self.logger.info('No more tipps')
                    sel = Selector(text=self.driver.page_source)
                    allposts = sel.xpath('//*[@class="block media _feedPick feed-pick"]')
                    for post in allposts:
                        username = post.xpath('.//div[@class="col-sm-7 col-lg-6 no-padding"]/a/@title').extract()
                        publish_date = post.xpath('.//*[@class="bet-age text-muted"]/text()').extract()
                        yield {'Username': username,
                               'Publish date': publish_date}
                    self.driver.close()
                    break
One simple way is to set the http_proxy and https_proxy environment variables.
You could set them in your environment before starting your script, or maybe add this at the beginning of your script:
import os
os.environ['http_proxy'] = 'http://my/proxy'
os.environ['https_proxy'] = 'http://my/proxy'
For a list of publicly available proxies, you will find plenty if you just search Google.
You should also read about Scrapy's HttpProxyMiddleware to explore this further; ways of using such proxies are covered there as well.
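If you want per-request control inside Scrapy rather than process-wide environment variables, the proxy can also be set on each request's meta and passed to the Chrome instance Selenium starts as a command-line option. A rough sketch, where the proxy address is only a placeholder:

import scrapy
from selenium import webdriver

PROXY = "http://my.proxy.host:8080"  # placeholder proxy address

class ProxiedPostsSpider(scrapy.Spider):
    name = 'proxied_posts'
    start_urls = ['https://example.com']

    def start_requests(self):
        for url in self.start_urls:
            # HttpProxyMiddleware picks up the proxy from request.meta
            yield scrapy.Request(url, callback=self.parse, meta={'proxy': PROXY})

    def parse(self, response):
        # route the Selenium-driven Chrome through the same proxy
        options = webdriver.ChromeOptions()
        options.add_argument(f'--proxy-server={PROXY}')
        driver = webdriver.Chrome(options=options)
        driver.get(response.url)
        driver.quit()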

How to access Scrapy httpcache middleware data directly

How can I access the data of Scrapy's httpcache middleware directly?
Something like this, in pseudocode:
URL = 'http://scrapedsite.com/category1/item1'
print retrieveRawHtml(URL)
from scrapy.utils.response import open_in_browser
from scrapy.http import HtmlResponse

url = 'http://scrapedsite.com/category1/item1'
body = '<html>hello</html>'

response = HtmlResponse(url, body=body, encoding='utf-8')
open_in_browser(response)
or from your callback:
def parse_cb(self, response):
    from scrapy.utils.response import open_in_browser
    open_in_browser(response)
If caching is turned on, it will pull from the cache.
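To read cached data outside of a crawl, the storage backend behind the middleware can also be instantiated directly. The sketch below assumes the default FilesystemCacheStorage and a project context; note that the exact API (in particular how requests are fingerprinted in open_spider) differs between Scrapy versions, so treat this as an outline rather than guaranteed-working code:

from scrapy.extensions.httpcache import FilesystemCacheStorage
from scrapy.http import Request
from scrapy.spiders import Spider
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
storage = FilesystemCacheStorage(settings)

# cache entries are grouped per spider name under HTTPCACHE_DIR
spider = Spider(name='scrapedsite')
request = Request('http://scrapedsite.com/category1/item1')

storage.open_spider(spider)  # newer Scrapy versions may expect a crawler-bound spider here
cached = storage.retrieve_response(spider, request)  # None if the URL is not cached
if cached is not None:
    print(cached.body)
storage.close_spider(spider)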