Scrapy: pull data from table rows

I'm trying to pull data from this page using Scrapy: https://www.interpol.int/notice/search/woa/1192802
The spider will crawl multiple pages but I have excluded the pagination code here to keep things simple. The problem is that the number of table rows that I want to scrape on each page can change each time.
So I need a way of scraping all the table data from the page no matter how many table rows it has.
First, I extracted all the table rows on the page. Then, I created a blank dictionary. Next, I tried to loop through each row and put its cell data into the dictionary.
But it does not work and it is returning a blank file.
Any idea what's wrong?
# -*- coding: utf-8 -*-
import scrapy


class Test1Spider(scrapy.Spider):
    name = 'test1'
    allowed_domains = ['interpol.int']
    start_urls = ['https://www.interpol.int/notice/search/woa/1192802']

    def parse(self, response):
        table_rows = response.xpath('//*[contains(@class,"col_gauche2_result_datasheet")]//tr').extract()

        data = {}
        for table_row in table_rows:
            data.update({response.xpath('//td[contains(@class, "col1")]/text()').extract(): response.css('//td[contains(@class, "col2")]/text()').extract()})
        yield data

What is this?

response.css('//td[contains(@class, "col2")]/text()').extract()

You are calling the css() method, but you are passing it an XPath expression.
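For reference, a quick sketch of the two correct pairings (selector strings taken from the question; the CSS form is an equivalent I am assuming based on the class names shown):

# CSS method with a CSS selector:
col2_texts = response.css('td.col2::text').extract()

# or keep the XPath expression, but call xpath() instead of css():
col2_texts = response.xpath('//td[contains(@class, "col2")]/text()').extract()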
Anyway, here is the 100% working code; I have tested it.
table_rows = response.xpath('//*[contains(@class,"col_gauche2_result_datasheet")]//tr')

data = {}
for table_row in table_rows:
    data[table_row.xpath('td[@class="col1"]/text()').extract_first().strip()] = table_row.xpath('td[@class="col2 strong"]/text()').extract_first().strip()

yield data
EDIT:
To remove characters like \t, \n, \r, etc., use a regex:

import re

your_string = re.sub(r'\t|\n|\r', '', your_string)

Try this; I hope it will help you.
# -*- coding: utf-8 -*-
import scrapy


class Test(scrapy.Spider):
    name = 'test1'
    allowed_domains = ['interpol.int']
    start_urls = ['https://www.interpol.int/notice/search/woa/1192802']

    def parse(self, response):
        table_rows = response.xpath('//*[contains(@class,"col_gauche2_result_datasheet")]//tr')
        for table_row in table_rows:
            current_row = table_row.xpath('.//td/text()').extract()
            print(current_row[0] + current_row[1].strip())
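The snippet above only prints the rows; to get them into an output feed, the same loop can yield items instead. A minimal sketch under the same selector assumptions:

# inside parse(): yield one item per table row instead of printing it
for table_row in table_rows:
    cells = table_row.xpath('.//td/text()').extract()
    if len(cells) >= 2:
        yield {cells[0].strip(): cells[1].strip()}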

Related

Scrapy Project Review/ Link Rules

This is my second project and I was wondering if someone could review it and give me best-practice advice on applying the Scrapy framework. I also have a specific issue: not all courses are scraped from the site.
Goal: scrape info for all golf courses from the Golf Advisor website. Link: https://www.golfadvisor.com/course-directory/1-world/
Approach: I used CrawlSpider to include rules for links to explore.
Result: Only 19,821 courses out of 36,587 were scraped from the site.
Code:
import scrapy
from urllib.parse import urljoin
from collections import defaultdict
# adding rules with crawlspider
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class GolfCourseSpider(CrawlSpider):
    name = 'golfadvisor'
    allowed_domains = ['golfadvisor.com']
    start_urls = ['https://www.golfadvisor.com/course-directory/1-world/']
    base_url = 'https://www.golfadvisor.com/course-directory/1-world/'

    # use rules to visit only pages with 'courses/' in the path and exclude pages with 'page=1, page=2, etc'
    # since those are duplicate links to the same course
    rules = [
        Rule(LinkExtractor(allow=('courses/'), deny=('page=')), callback='parse_filter_course', follow=True),
    ]

    def parse_filter_course(self, response):
        # checking if it is an actual course page; excluded it for the final run
        # exists = response.css('.CoursePageSidebar-map').get()
        # if exists:

        # the page is split into multiple sections with a different amount of detail specified on each.
        # I decided to use a nested for loop (for section in sections, for detail in section) to retrieve the data.
        about_section = response.css('.CourseAbout-information-item')
        details_section = response.css('.CourseAbout-details-item')
        rental_section = response.css('.CourseAbout-rentalsServices-item')
        practice_section = response.css('.CourseAbout-practiceInstruction-item')
        policies_section = response.css('.CourseAbout-policies-item')
        sections = [
            about_section,
            details_section,
            rental_section,
            practice_section,
            policies_section
        ]
        # created a default list dict to add new details from the for loops
        dict = defaultdict(list)
        # also have details added NOT from the for-loop sections, but hard coded using css and xpath selectors.
        dict = {
            'link': response.url,
            'Name': response.css('.CoursePage-pageLeadHeading::text').get().strip(),
            'Review Rating': response.css('.CoursePage-stars .RatingStarItem-stars-value::text').get('').strip(),
            'Number of Reviews': response.css('.CoursePage-stars .desktop::text').get('').strip().replace(' Reviews', ''),
            '% Recommend this course': response.css('.RatingRecommendation-percentValue::text').get('').strip().replace('%', ''),
            'Address': response.css('.CoursePageSidebar-addressFirst::text').get('').strip(),
            'Phone Number': response.css('.CoursePageSidebar-phoneNumber::text').get('').strip(),
            # website has a redirecting link; did not figure out how to get the main URL during the scraping process
            'Website': urljoin('https://www.golfadvisor.com/', response.css('.CoursePageSidebar-courseWebsite .Link::attr(href)').get()),
            'Latitude': response.css('.CoursePageSidebar-map::attr(data-latitude)').get('').strip(),
            'Longitude': response.css('.CoursePageSidebar-map::attr(data-longitude)').get('').strip(),
            'Description': response.css('.CourseAbout-description p::text').get('').strip(),
            # here, I was suggested to use xpath to retrieve text. should it be used for the fields above and why?
            'Food & Beverage': response.xpath('//h3[.="Available Facilities"]/following-sibling::text()[1]').get('').strip(),
            'Available Facilities': response.xpath('//h3[.="Food & Beverage"]/following-sibling::text()[1]').get('').strip(),
            # another example of using xpath for microdata
            'Country': response.xpath("(//meta[@itemprop='addressCountry'])/@content").get('')
        }
        # nested for loop I mentioned above
        for section in sections:
            for item in section:
                dict[item.css('.CourseValue-label::text').get().strip()] = item.css('.CourseValue-value::text').get('').strip()

        yield dict
E.g., it discovered only two golf courses in Mexico:
Club Campestre de Tijuana
Real del Mar Golf Resort
I've run the code specifically on the pages it didn't pick up and was able to scrape those pages individually, so my link extraction rules must be wrong.
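A quick way to check which links the rule actually matches on a given directory page is to run the extractor by hand inside scrapy shell. A minimal sketch (same allow/deny patterns as the spider above):

# inside `scrapy shell <directory-page-url>`, where `response` is the fetched page
from scrapy.linkextractors import LinkExtractor

le = LinkExtractor(allow=('courses/',), deny=('page=',))
links = le.extract_links(response)
print(len(links))
for link in links[:20]:
    print(link.url)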
This is the output file with ~20k courses: https://drive.google.com/file/d/1izg2gZ87qbmMtg4S_VKQmkzlKON3poIs/view?usp=sharing
Thank you,
Yours Data Enthusiast

How do I find a specific tag's value (which could be anything) with beautifulsoup?

I am trying to get the job IDs from the tags of Indeed listings. So far, I have taken Indeed search results and put each job into its own "bs4.element.Tag" object, but I don't know how to extract the value of the tag (or is it a class?) "data-jk". Here is what I have so far:
import requests
import bs4
import re

# 1: scrape (5?) pages of search results for listing ID's
results = []
results.append(requests.get("https://www.indeed.com/jobs?q=data+analyst&l=United+States&start=0"))
results.append(requests.get("https://www.indeed.com/jobs?q=data+analyst&l=United+States&start=10"))
results.append(requests.get("https://www.indeed.com/jobs?q=data+analyst&l=United+States&start=20"))
results.append(requests.get("https://www.indeed.com/jobs?q=data+analyst&l=United+States&start=30"))
results.append(requests.get("https://www.indeed.com/jobs?q=data+analyst&l=United+States&start=40"))
# each search page has a query "q", location "l", and a "start" = 10*int
# the search results are contained in a "td" with ID = "resultsCol"

justjobs = []
for eachResult in results:
    soup_jobs = bs4.BeautifulSoup(eachResult.text, "lxml")  # this is for IDs
    justjobs.extend(soup_jobs.find_all(attrs={"data-jk": True}))  # re.compile("data-jk")
# each "card" is a div object
# each has the class "jobsearch-SerpJobCard unifiedRow row result clickcard"
# as well as a specific tag "data-jk"
# "data-jk" seems to be the actual IDs used in each listing's URL

# Now, each div element has a data-jk. I will try to get data-jk from each one:
jobIDs = []
print(type(justjobs[0]))  # DEBUG
for eachJob in justjobs:
    jobIDs.append(eachJob.find("data-jk"))

print("Length: " + str(len(jobIDs)))  # DEBUG
print("Example JobID: " + str(jobIDs[1]))  # DEBUG
The examples I've seen online generally try to get the information contained between an opening and closing tag, but I am not sure how to get the info from inside the (opening) tag itself. I've tried doing it by parsing it as a string instead:
print(justjobs[0])

for eachJob in justjobs:
    jobIDs.append(str(eachJob)[115:131])

print(jobIDs)
but the website is also inconsistent with how the tags operate, and I think that using beautifulsoup would be more flexible than multiple cases and substrings.
Any pointers would be greatly appreciated!
Looks like you can regex them out from a script tag
import requests,re
html = requests.get('https://www.indeed.com/jobs?q=data+analyst&l=United+States&start=0').text
p = re.compile(r"jk:'(.*?)'")
ids = p.findall(html)
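Alternatively, since each element in justjobs is already a bs4.element.Tag, the attribute can be read directly, dictionary-style. A minimal sketch reusing the question's variables:

# read the data-jk attribute straight from each matched Tag object
jobIDs = []
for eachJob in justjobs:
    jk = eachJob.get("data-jk")  # returns None if the attribute is missing
    if jk:
        jobIDs.append(jk)
print(jobIDs[:5])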

Python Dynamic Test Plan generation

I am using Sphinx for documentation and pytest for testing.
I need to generate a test plan but I really don't want to generate it by hand.
It occurred to me that a neat solution would be to embed test metadata in the tests themselves, within their respective docstrings. This metadata would include things like % complete, time remaining, etc. I could then run through all of the tests (which would at this point be mostly placeholders) and generate a test plan from them. This would guarantee that the test plan and the tests themselves stay in sync.
I was thinking of making either a pytest plugin or a sphinx plugin to handle this.
Using pytest, the closest hook I can see looks like pytest_collection_modifyitems which gets called after all of the tests are collected.
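For context, a bare-bones conftest.py sketch of that hook (the parsing step is only hinted at here; a full implementation appears in the accepted answer below):

# runs once after collection, receiving every collected test item
def pytest_collection_modifyitems(session, config, items):
    for item in items:
        func = getattr(item, "function", None)  # only function-based items have this
        doc = (func.__doc__ or "") if func else ""
        # the :plan_...: metadata could be parsed out of `doc` here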
Alternatively, I was thinking of using Sphinx and perhaps copying/modifying the todolist plugin, as it seems like the closest match to this idea. Its output would be more useful, since it would slot nicely into my existing Sphinx-based docs, but there is a lot going on in that plugin and I don't really have the time to invest in understanding it.
The docstrings could have something like this within it:
:plan_complete: 50 #% indicator of how complete this test is
:plan_remaining: 2 #the number of hours estimated to complete this test
:plan_focus: something #what is the test focused on testing
The idea is to then generate a simple markdown/rst or similar table based on the function's name, docstring and embedded plan info and use that as the test plan.
Does something like this already exist?
In the end I went with a pytest based plugin as it was just so much simpler to code.
If anyone else is interested, below is the plugin:
"""Module to generate a test plan table based upon metadata extracted from test
docstrings. The test description is extracted from the first sentence or up to
the first blank line. The data which is extracted from the docstrings are of the
format:
:test_remaining: 10 #number of hours remaining for this test to be complete. If
not present, assumed to be 0
:test_complete: #the percentage of the test that is complete. If not
present, assumed to be 100
:test_focus: The item the test is focusing on such as a DLL call.
"""
import pytest
import re
from functools import partial
from operator import itemgetter
from pathlib import Path
whitespace_re = re.compile(r'\s+')
cut_whitespace = partial(whitespace_re.sub, ' ')
plan_re = re.compile(r':plan_(\w+?):')
plan_handlers = {
'remaining': lambda x:int(x.split('#')[0]),
'complete': lambda x:int(x.strip().split('#')[0]),
'focus': lambda x:x.strip().split('#')[0]
}
csv_template = """.. csv-table:: Test Plan
:header: "Name", "Focus", "% Complete", "Hours remaining", "description", "path"
:widths: 20, 20, 10, 10, 60, 100
{tests}
Overall hours remaining: {hours_remaining:.2f}
Overall % complete: {complete:.2f}
"""
class GeneratePlan:
def __init__(self, output_file=Path('test_plan.rst')):
self.output_file = output_file
def pytest_collection_modifyitems(self, session, config, items):
#breakpoint()
items_to_parse = {i.nodeid.split('[')[0]:i for i in self.item_filter(items)}
#parsed = map(parse_item, items_to_parse.items())
parsed = [self.parse_item(n,i) for (n,i) in items_to_parse.items()]
complete, hours_remaining = self.get_summary_data(parsed)
self.output_file.write_text(csv_template.format(
tests = '\n'.join(self.generate_rst_table(parsed)),
complete=complete,
hours_remaining=hours_remaining))
def item_filter(self, items):
return items #override me
def get_summary_data(self, parsed):
completes = [p['complete'] for p in parsed]
overall_complete = sum(completes)/len(completes)
overall_hours_remaining = sum(p['remaining'] for p in parsed)
return overall_complete, overall_hours_remaining
def generate_rst_table(self, items):
"Use CSV type for simplicity"
sorted_items = sorted(items, key=lambda x:x['name'])
quoter = lambda x:'"{}"'.format(x)
getter = itemgetter(*'name focus complete remaining description path'.split())
for item in sorted_items:
yield 3*' ' + ', '.join(map(quoter, getter(item)))
def parse_item(self, path, item):
"Process a pytest provided item"
data = {
'name': item.name.split('[')[0],
'path': path.split('::')[0],
'description': '',
'remaining': 0,
'complete': 100,
'focus': ''
}
doc = item.function.__doc__
if doc:
desc = self.extract_description(doc)
data['description'] = desc
plan_info = self.extract_info(doc)
data.update(plan_info)
return data
def extract_description(self, doc):
first_sentence = doc.split('\n\n')[0].replace('\n',' ')
return cut_whitespace(first_sentence)
def extract_info(self, doc):
plan_info = {}
for sub_str in doc.split('\n\n'):
cleaned = cut_whitespace(sub_str.replace('\n', ' '))
splitted = plan_re.split(cleaned)
if len(splitted) > 1:
i = iter(splitted[1:]) #splitter starts at index 1
while True:
try:
key = next(i)
val = next(i)
except StopIteration:
break
assert key
if key in plan_handlers:
plan_info[key] = plan_handlers[key](val)
return plan_info
From my conftest.py file, I have a command line argument configured within a pytest_addoption function: parser.addoption('--generate_test_plan', action='store_true', default=False, help="Generate test plan")
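Spelled out, that part of conftest.py looks like this (same option as quoted above):

# conftest.py: registers the command line flag used to enable plan generation
def pytest_addoption(parser):
    parser.addoption('--generate_test_plan', action='store_true',
                     default=False, help="Generate test plan")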
And I then configure the plugin within this function:
def pytest_configure(config):
    output_test_plan_file = Path('docs/source/test_plan.rst')

    class CustomPlan(GeneratePlan):
        def item_filter(self, items):
            return (i for i in items if 'tests/hw_regression_tests' in i.nodeid)

    if config.getoption('generate_test_plan'):
        config.pluginmanager.register(CustomPlan(output_file=output_test_plan_file))
        #config.pluginmanager.register(GeneratePlan())
Finally, in one of my sphinx documentation source files I then just include the output rst file:
Autogenerated test_plan
=======================

The below test_data is extracted from the individual tests in the suite.

.. include:: test_plan.rst
We have done something similar in our company by using Sphinx-needs and Sphinx-Test-Reports.
Inside a test file, we use the docstring to store our test case, including its metadata:
def my_test():
    """
    .. test:: My test case
       :id: TEST_001
       :status: in progress
       :author: me

       This test case checks for **awesome** stuff.
    """
    a = 2
    b = 5
    # ToDo: check if a+b = 7
Then we document the test cases by using autodoc.
My tests
========

.. automodule:: test.my_tests
   :members:
This results in some nice test-case objects in Sphinx, which we can filter, link, and present in tables and flowcharts. See Sphinx-Needs.
With Sphinx-Test-Reports we are loading the results into the docs as well:
.. test-report:: My Test report
   :id: REPORT_1
   :file: ../pytest_junit_results.xml
   :links: [[tr_link('case_name', 'signature')]]
This will create objects for each test case, which we also can filter and link.
Thanks to tr_link, the result objects get automatically linked to the test case objects.
After that we have all the needed information in Sphinx and can use e.g. .. needtable:: to get custom views on it, for example:
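A minimal sketch of such a table (the :types: and :columns: options are from my recollection of the Sphinx-Needs directive; check its documentation for the exact option names):

.. needtable::
   :types: test
   :columns: id;title;status;author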

Flask button to save table from query as csv

I have a flask app that runs a query and returns a table. I would like to provide a button on the page so the user can export the data as a csv.
The problem is that the query is generated dynamically based on form input.
@app.route('/report/<int:account_id>', methods=['GET'])
def report(account_id):
    if request.method == 'GET':
        c = g.db.cursor()
        c.execute('SELECT * FROM TABLE WHERE account_id = :account_id', account_id=account_id)
        entries = [dict(title=row[0], text=row[1]) for row in c.fetchall()]
        return render_template('show_results.html', entries=entries)
On the html side it's just a simple table, looping over the rows and rendering them. I'm using bootstrap for styling, and included a tablesorter jquery plugin. None of this is really consequential. I did try one javascript exporter I found, but since my content is rendered dynamically, it saves a blank CSV.
Do I need to do some ajax-style trickery to grab a csv object from the route?
I solved this myself. For anyone who comes across this, I think it is valuable for this specific use case within Flask. Here's what I did.
import cx_Oracle  # We are an Oracle shop, and this changes some things
import csv
import StringIO  # allows you to store the response object in memory instead of on disk
from flask import Flask, make_response  # Necessary imports, should be obvious


@app.route('/export/<int:identifier>', methods=['GET'])
def export(identifier):
    si = StringIO.StringIO()
    cw = csv.writer(si)
    c = g.db.cursor()
    c.execute('SELECT * FROM TABLE WHERE column_val = :identifier', identifier=identifier)
    rows = c.fetchall()
    cw.writerow([i[0] for i in c.description])
    cw.writerows(rows)
    response = make_response(si.getvalue())
    response.headers['Content-Disposition'] = 'attachment; filename=report.csv'
    response.headers["Content-type"] = "text/csv"
    return response
For anyone using Flask with SQLAlchemy, here's an adjustment to tadamhicks' answer, also with a library update:
import csv
from io import StringIO
from flask import make_response
si = StringIO()
cw = csv.writer(si)
records = myTable.query.all() # or a filtered set, of course
# any table method that extracts an iterable will work
cw.writerows([(r.fielda, r.fieldb, r.fieldc) for r in records])
response = make_response(si.getvalue())
response.headers['Content-Disposition'] = 'attachment; filename=report.csv'
response.headers["Content-type"] = "text/csv"
return response
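A minimal sketch of that fragment wrapped in a complete view (myTable and the fielda/fieldb/fieldc names are the same placeholders used in the snippet above):

import csv
from io import StringIO

from flask import make_response


@app.route('/export', methods=['GET'])
def export_csv():
    si = StringIO()
    cw = csv.writer(si)
    records = myTable.query.all()  # placeholder model, as in the snippet above
    cw.writerow(['fielda', 'fieldb', 'fieldc'])  # optional header row
    cw.writerows([(r.fielda, r.fieldb, r.fieldc) for r in records])
    response = make_response(si.getvalue())
    response.headers['Content-Disposition'] = 'attachment; filename=report.csv'
    response.headers['Content-type'] = 'text/csv'
    return response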

Title and description aren't indexed with collective.dexteritytextindexer

I have lots of Dexterity content types, some of them are just containers and are left with just the Title and Description (from plone.app.dexterity.behaviors.metadata.IBasic behavior).
I can find them by searching the text inside their title or description.
But for some complex content types I'm using collective.dexteritytextindexer to index some more fields and it works fine, I can find the text on the fields I marked to be indexed.
However the Title and Description are no longer available for searching. I tried something like:
class IMyContent(form.Schema):
    """My content type description
    """
    dexteritytextindexer.searchable('title')
    dexteritytextindexer.searchable('description')
    dexteritytextindexer.searchable('long_desc')
    form.widget(long_desc=WysiwygFieldWidget)
    long_desc = schema.Text(
        title=_(u"Rich description"),
        description=_(u"Complete description"),
        required=False,
    )
    ...
But I can't see the content of title and description on the SearchableText column in the portal_catalog, and thus the results don't show them.
Any idea what I'm missing?
Cheers,
Got pretty much the same issue. Following the documentation on http://pypi.python.org/pypi/collective.dexteritytextindexer I used
from collective import dexteritytextindexer
from plone.autoform.interfaces import IFormFieldProvider
from plone.directives import form
from zope import schema
from zope.interface import alsoProvides


class IMyBehavior(form.Schema):
    dexteritytextindexer.searchable('specialfield')
    specialfield = schema.TextLine(title=u'Special field')


alsoProvides(IMyBehavior, IFormFieldProvider)
to get my own fields indexed. However, the code
from plone.app.dexterity.interfaces import IBasic
from collective.dexteritytextindexer.utils import searchable
searchable(IBasic, 'title')
searchable(IBasic, 'description')
didn't work; the import of IBasic fails. It seems this can easily be solved by importing
from plone.app.dexterity.behaviors.metadata import IBasic
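Putting the corrected import together with the searchable() calls, the working version of that snippet looks like this:

from plone.app.dexterity.behaviors.metadata import IBasic
from collective.dexteritytextindexer.utils import searchable

searchable(IBasic, 'title')
searchable(IBasic, 'description')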
The problem is probably that the field is coming from the IBasic or IDublinCore behaviour and not from your schema. I don't know enough about collective.dexteritytextindexer to know how to work around this, though.
Another option may be to just use plone.indexer and create your own SearchableText indexer that returns "%s %s %s" % (context.title, context.description, context.long_desc,). See the Dexterity docs for details.
As a reference this is the code I ended up writing:
# imports this snippet relies on
from five import grok
from plone.indexer import indexer
from Products.CMFCore.utils import getToolByName


@indexer(IMyDexterityType)
def searchableIndexer(context):
    transforms = getToolByName(context, 'portal_transforms')
    long_desc = context.long_desc  # long_desc is a rich text field
    if long_desc is not None:
        long_desc = transforms.convert('html_to_text', long_desc).getData()
    contacts = context.contacts  # contacts is also a rich text field
    if contacts is not None:
        contacts = transforms.convert('html_to_text', contacts).getData()
    return "%s %s %s %s" % (context.title, context.description, long_desc, contacts,)

grok.global_adapter(searchableIndexer, name="SearchableText")