Scrapy extract all text into one cell how to divide? - scrapy

I am trying to scrape some tweets but when i do in scrapy all tweets goes into 1 cell in csv file. How can i make it extract each tweet into 1 individual cell !
my code is:
class tiktokSpider(scrapy.Spider):
name = 'twitter'
allowed_domains = ['twitter.com']
start_urls = ['http://twitter.com/']
allowed_domains = ['app.scrapingbee.com']
custom_settings = {'CONCURRENT_REQUESTS_PER_DOMAIN': 10}
custom_settings = {'FEEDS':{'poststoday.csv':{'format':'csv'}}}
def start_requests(self):
for username in user_accounts:
url = f'https://app.scrapingbee.com/api/v1/?api_key=hiddenkey&url=https://twitter.com/{username}&js_scroll=true&js_scroll_count=3&premium_proxy=true&country_code=us'
yield scrapy.Request(url, callback=self.parse)
def parse(self, response):
items = TwitterproItem()
full_name = response.css('.r-135wba7.r-1udh08x .r-qvutc0 .r-qvutc0::text').extract()
tweet_text = response.xpath('//*[contains(#class,"css-901oao r-18jsvk2")]//text()').extract()
items['full_name'] = full_name
items['tweet_text'] = tweet_text
yield items
any suggestions? Thank you

First, check the values of full_name and tweet_text using self.logger.info(full_name),
If full_name is a list then replace :
items['full_name'] = full_name
items['tweet_text'] = tweet_text
yield items
with :
for (name, text) in zip(full_name, tweet_text):
item['full_name'] = name
item['tweet_text] = text
yield item

Related

Scraper not grabbing all text in list/div

My scraper seems to skip on information, for example I want to extract all the countries and the highway codes belonging to each country. However, when I iterate over this I only get one country and one highway code for each list of numbered highways.
How do I get all the highway routes and their respective countries?
Here's the scraper that I am working with:
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst, MapCompose
from itemloaders import ItemLoader
class roadItem(scrapy.Item):
highway_names = Field(output_processor = TakeFirst())
country = Field(output_processor = TakeFirst())
route_name = Field(output_processor = TakeFirst())
class roadSpider(scrapy.Spider):
name = "road"
start_urls = ["https://en.wikipedia.org/w/index.php?title=Category:Lists_of_roads_sharing_the_same_title&pageuntil=092%0AList+of+highways+numbered+92#mw-pages"]
def start_requests(self):
for url in self.start_urls:
yield scrapy.Request(
url,
callback = self.parse
)
def parse(self, response):
list_data = response.xpath('(//div[#class="mw-category-group"][last()])[2]//ul')
for items in list_data:
for link in items.xpath(".//a/#href").getall():
yield response.follow(
response.urljoin(link),
callback = self.parse_roads
)
next_page = response.xpath("//div[#id='mw-pages']/a[1]//#href").get()
if next_page is not None:
yield response.follow(
response.urljoin(next_page),
callback = self.parse
)
def parse_roads(self, response):
loader = ItemLoader(roadItem())
loader.add_value("highway_names", response.xpath("//h1[#id='firstHeading']//text()").get())
data = response.xpath("//div[#id='mw-content-text']")
for list_h2 in data:
if list_h2.xpath("(((//h2)[position() >1]//span)[position() mod 4=1])[position() < last()]//text()").get():
loader.add_value("country", list_h2.xpath("(((//h2)[position() >1]//span)[position() mod 4=1])[position() < last()]//text()").get())
else:
loader.add_value("country", list_h2.xpath("(//h2//span)[1]//text()").get())
if list_h2.xpath("(//div[#class='mw-parser-output']//ul)[position() > 1 and position() < last()]//li//#title").getall():
for routes in list_h2.xpath("(//div[#class='mw-parser-output']//ul)[position() > 1 and position() < last()]//li//#title").getall():
loader.add_value('route_name', routes)
else:
for routess in list_h2.xpath("//ul//li//#title").getall():
loader.add_value('route_name', routess)
yield loader.load_item()

scrapy pagination not working on tripadvisor

i'm trying to scrape the restaurant pages on tripadvisor (just to learn how it works)
However, i only get the first page.
What am I missing?
here is the code, thanks!
import scrapy
class TripadvSpider(scrapy.Spider):
name = 'tripadv'
allowed_domains = ['tripadvisor.com']
start_urls = ['https://www.tripadvisor.com/Restaurants-g60795-oa0-Philadelphia_Pennsylvania.html#EATERY_LIST_CONTENTS']
def parse(self, response):
for stores in response.css('div.emrzT'):
yield {
'name' : stores.css('a.bHGqj::text').extract(),
'link' : stores.css('a.bHGqj').xpath("#href").extract()
}
next_page = ('http://tripadvisor.com' + response.css('a.nav').attrib['href']).extract()
##next_page = response.xpath('//a[contains(text(), "Next")]/#href).extract())
#next_page = ('http://tripadvisor.com' + response.css('a:contains("Next")').attrib['href'].extract())
if next_page is not None:
yield response.follow(next_page, callback=self.parse)
#djmystica, Now it's working fine
import scrapy
class TripadvSpider(scrapy.Spider):
name = 'tripadv'
allowed_domains = ['tripadvisor.com']
start_urls = [
'https://www.tripadvisor.com/Restaurants-g60795-oa0-Philadelphia_Pennsylvania.html#EATERY_LIST_CONTENTS']
def parse(self, response):
for stores in response.css('div.emrzT'):
yield {
'name': stores.css('a.bHGqj::text').extract_first(),
'link': stores.css('a.bHGqj').xpath("#href").extract_first()}
#next_page = ('http://tripadvisor.com' +response.css('a.nav').attrib['href']).extract()
next_page = response.xpath('//a[contains(text(), "Next")]/#href').extract_first()
abs_next_page = f'https://www.tripadvisor.com{next_page}'
#next_page = ('http://tripadvisor.com' + response.css('a:contains("Next")').attrib['href'].extract())
if abs_next_page is not None:
yield response.follow(abs_next_page, callback=self.parse)

Is there a way to get ID of the starting URL from database in scrapy with some function, make_requests_from_url

I am pulling start URL's from Database and also need ID's associated with the URL so that I can pass it in the ITEMS pipeline and store in the table along with items.
I am using "make_requests_from_url(row[1])" to pass the start URL's "start_urls = []" which forms the list of starting URL's. The id's row[0] is what I need to pass to Items when the respective items are crawled.
Below is my spider code:
import scrapy
import mysql.connector
from ..items import AmzProductinfoItem
class AmzProductinfoSpiderSpider(scrapy.Spider):
name = 'amz_ProductInfo_Spider'
nextPageNumber = 2
allowed_domains = ['amazon.in']
start_urls = []
url_fid = []
def __init__(self):
self.connection = mysql.connector.connect(host='localhost', database='datacollecter', user='root', password='', charset="utf8", use_unicode=True)
self.cursor = self.connection.cursor()
def start_requests(self):
sql_get_StartUrl = 'SELECT * FROM database.table'
self.cursor.execute(sql_get_StartUrl)
rows = self.cursor.fetchall()
for row in rows:
yield self.make_requests_from_url(row[1])
I have tried with comparing "response.url" in parse method but that changes as spider moves on to next page.
Not sure how can I achieve this. any direction is appreciated.
It's not clear why do you need self.make_requests_from_url. You can yield your requests directly:
def start_requests(self):
sql_get_StartUrl = 'SELECT * FROM database.table'
self.cursor.execute(sql_get_StartUrl)
rows = self.cursor.fetchall()
for row in rows:
yield scrapy.Request(url=row[1], meta={'url_id': row[0]}, callback=self.parse)
def parse(self, response):
url_id = response.meta["url_id"]

lambda in start_pares in scrapy

Could anyone tell me why the number of index variable in parse() is 10013 at all time?
class GetsourcesSpider(scrapy.Spider):
name = 'getSources'
allowed_domains = ['bizhi.feihuo.com']
base_url = 'http://bizhi.feihuo.com/wallpaper/share?rsid={index}/'
def start_requests(self):
for index in range(10010, 10014):#11886
yield scrapy.Request(url=self.base_url.format(index=index), callback=lambda response:self.parse(response,index))
def parse(self, response, index):
video_label = response.xpath('//video')[0]
item = DynamicdesktopItem()
item['index'] = index # response.url[-6:-1]
item['video'] = video_label.attrib['src']
item['image'] = video_label.attrib['poster']
yield item
That is because you are giving the index variable reference and not a value, that's why you get the last value. You need to use meta object for the same. Please see the updated code below
class GetsourcesSpider(scrapy.Spider):
name = 'getSources'
allowed_domains = ['bizhi.feihuo.com']
base_url = 'http://bizhi.feihuo.com/wallpaper/share?rsid={index}/'
def start_requests(self):
for index in range(10010, 10014):#11886
yield scrapy.Request(url=self.base_url.format(index=index), callback=self.parse, meta = {'index': index})
def parse(self, response):
index = response.meta['index']
video_label = response.xpath('//video')[0]
item = DynamicdesktopItem()
item['index'] = index # response.url[-6:-1]
item['video'] = video_label.attrib['src']
item['image'] = video_label.attrib['poster']
yield item
Because index variable referred to from all lambdas is not copied to their local scope. It is rewritten on each next loop iteration.
Consider this snippet:
lambdas = []
for i in range(3):
lambdas.append(lambda: print(i))
for fn in lambdas:
fn()
this will print three 2's, the last value of i.
Instead of doing lambda callbacks you should utilize meta= keyword of a Request class:
https://doc.scrapy.org/en/latest/topics/request-response.html#request-meta-special-keys

Getting repeated requests from same url with different values

I am trying to crawl some data as my side project but I am having a problem gathering it. I have been trying for two day without much luck.
First problem:
When I crawl the post form the main page I get a wrong token.
Second problem:
I have read and I have tried to implement scrapy docs request to get the phone number but in vain,
or this answer
stackoverflow
Third problem:
How would I go to implement the next page (comment out code inside gumtree.py).
Fourth problem:
I am now able to get the phone numbers but I am getting repeated requests to the same url with different values, [see results]
I would really appreciate if anyone could give me a direction.
My main goal is to crawl post that have phone numbers
I have tried to search stackoverflow but I couldn't find the proper post.
Many Thanks
setting.py
BOT_NAME = 'crawler'
SPIDER_MODULES = ['crawler.spiders']
NEWSPIDER_MODULE = 'crawler.spiders'enter code here
USER_AGENT = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36"
TELNETCONSOLE_ENABLED = False
gumtree.py [UPDATED]
# -*- coding: utf-8 -*-
import re
import json
import scrapy
from scrapy import Request, Item, Field, Selector
def complete_link(string):
return string
class MyItem(Item):
token = Field()
post_id = Field()
post_url = Field()
phone_num = Field()
phone_url = Field()
class GumtreeSpider(scrapy.Spider):
name = "gumtree"
allowed_domains = ["gumtree.com"]
start_urls = [
'https://www.gumtree.com/search?search_category=cars',
]
def parse(self, response):
item = MyItem()
for href in response.css('a.listing-link::attr(href)').extract():
domain = 'https://www.gumtree.com' + href
request = Request(domain, callback=self.parse_post, meta={'domain':domain,'item':item})
yield request
# next_page = response.css('li.pagination-next a::attr("href")').extract_first()
# if next_page is not None:
# next_page = response.urljoin(next_page)
# yield Request(next_page, callback=self.parse)
def parse_post(self, response):
item = response.meta['item']
item['post_url'] = response.meta['domain']
post_id = re.match('.*?([0-9]+)$', item['post_url'])
if post_id:
item['post_id'] = post_id.group(1)
token = response.xpath('//script[contains(., "revealSellerTelephoneNumberToken")]').extract()
arr_token = re.findall(r'"([^"]*)"', str(token))
if len(arr_token) == 15:
item['token'] = arr_token[-2]
request = Request('https://www.gumtree.com/ajax/account/seller/reveal/number/' + item['post_id'], headers={'X-GUMTREE-TOKEN':item['token']}, callback=self.parse_phone, meta={'item':item})
yield request
def parse_phone(self, response):
item = response.meta['item']
phone = json.loads(response.body_as_unicode())
item['phone_num'] = phone['data']
return item
results: [scrapy crawl gumtree -o ..\result.json]
{"post_url": "https://www.gumtree.com/p/ford/ford-galaxy-2.0-tdci-auto-titanium-7-seater-full-service-history-alloys/1214586540", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "004407488470949"},
{"post_url": "https://www.gumtree.com/p/ford/ford-galaxy-2.0-tdci-auto-titanium-7-seater-full-service-history-alloys/1214586540", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "004407488470949"},
{"post_url": "https://www.gumtree.com/p/ford/ford-galaxy-2.0-tdci-auto-titanium-7-seater-full-service-history-alloys/1214586540", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "01527853397"},
Have you checked that meta['item'] actually being passed to parse_token()?
I'd do the following:
meta = { 'item': item }
request = Request(response.urljoin(href), meta=meta, callback=self.parse_token)
yield request
I have found the solution.
# -*- coding: utf-8 -*-
import re, json, scrapy
from crawler.items import CrawlerItem
from scrapy import Request, Item, Field, Selector
gumtree = 'https://www.gumtree.com'
getphone = 'https://www.gumtree.com/ajax/account/seller/reveal/number/'
class GumtreeSpider(scrapy.Spider):
name = "gumtree"
allowed_domains = ["gumtree.com"]
start_urls = [
'https://www.gumtree.com/search?search_category=cars',
]
def parse(self, response):
item = CrawlerItem()
pid = []
arr_url = []
for href in response.css('a.listing-link::attr(href)').extract():
if len(href) > 0:
post_id = u''.join(href).encode('utf-8').strip()
post_id = re.match('.*?([0-9]+)$', post_id)
if post_id:
pid.append(post_id.group(1))
domain = gumtree + href
arr_url.append(domain)
i = 0
while i < len(arr_url):
url = u''.join(arr_url[i]).encode('utf-8').strip()
request = Request(url, callback=self.parse_post, meta={'url':url,'item':item,'pid':pid[i]}, headers={'Referer':gumtree})
i += 1
yield request
next_page = response.css('li.pagination-next a::attr("href")').extract_first()
if next_page is not None:
next_page = response.urljoin(next_page)
yield Request(next_page, callback=self.parse)
def parse_post(self, response):
item = response.meta['item']
item['post_id'] = response.meta['pid']
item['post_url'] = response.meta['url']
token = response.xpath('//script[contains(., "revealSellerTelephoneNumberToken")]').extract()
arr_token = re.findall(r'"([^"]*)"', str(token))
if len(arr_token) == 15:
item['token'] = arr_token[-2]
ref = item['post_url']
req = Request(getphone + item['post_id'], callback=self.parse_phone, headers={'X-GUMTREE-TOKEN':item['token'], 'Referer':ref}, meta={'url':response.meta['url'],'item':item})
return req
def parse_phone(self, response):
item = response.meta['item']
item['post_url'] = response.meta['url']
phone = json.loads(response.body_as_unicode())
item['phone_num'] = u''.join(phone['data']).encode('utf-8').strip()
return item