Append to URLs at a given substring from a range of integers - scrapy

I am trying to format the URLs from a given file.
The URLs have this format:
http://randomtext&of=randomtext&=.com
There is an __init__ method that reads the URLs from a file:
def __init__(self, filename=None):
    if filename:
        with open('urls.txt') as f:
            self.start_urls = [url.strip() for url in f.readlines()]
I can append a range of integers to the end of each URL:
def __init__(self, filename=None):
    if filename:
        with open('urls.txt') as f:
            self.start_urls = [url.strip() for url in f.readlines()]
        self.start_urls = [url + str(i*10) for i in range(0, 25) for url in self.start_urls]
My question is how to append a number from a range of integers right after the substring 'of=' in the above URLs, e.g.:
http://randomtext&of=1randomtext
http://randomtext&of=2randomtext
http://randomtext&of=3randomtext
Thank you in advance!

You could use the sub() function from Python's re module to replace the numbers in the URLs with numbers from a given range.
Building upon your provided example, one possible solution could be this:
import re

def __init__(self, filename=None):
    if filename:
        with open('urls.txt') as f:
            self.start_urls = [url.strip() for url in f.readlines()]
        self.start_urls = [re.sub("[0-9][0-9]*", str(i*10), url) for i in range(0, 25) for url in self.start_urls]
You may want to check whether there are other numbers in your URLs that you don't want substituted and adjust the regex accordingly.
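If the URLs don't already contain a digit after 'of=' (as in the example, where the value is random text), anchoring the substitution on the parameter name itself may be safer. A minimal standalone sketch, assuming you simply want to insert the counter right after 'of=':
import re

# Standalone sketch: insert a counter directly after "of=" in each URL.
urls = ["http://randomtext&of=randomtext&=.com"]
numbered_urls = [
    re.sub(r"of=", "of=" + str(i), url)  # replace "of=" with "of=<number>"
    for i in range(1, 4)
    for url in urls
]
# numbered_urls now contains:
#   http://randomtext&of=1randomtext&=.com
#   http://randomtext&of=2randomtext&=.com
#   http://randomtext&of=3randomtext&=.com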

Related

Need to return Scrapy callback method data to calling function

In the code below I am trying to collect email IDs from a website. They can be on the contact or about-us page.
From the parse method I follow the extemail method for all those pages.
From every page I collect a few email IDs.
Now I need to print them together with the original record sent to the __init__ method.
For example:
record = "https://www.wockenfusscandies.com/"
I want to print the output as:
https://www.wockenfusscandies.com/|abc@gmail.com|def@outlook.com
I am not able to store them in self.emails and deliver them back to the __init__ method.
Please help.
import scrapy
from scrapy.crawler import CrawlerProcess

class EmailSpider(scrapy.Spider):
    def __init__(self, record):
        self.record = record
        self.emails = []
        url = record.split("|")[4]
        if not url.startswith("http"):
            url = "http://{}".format(url)
        if url:
            self.start_urls = ["https://www.wockenfusscandies.com/"]
        else:
            self.start_urls = []

    def parse(self, response):
        contact_list = [a.attrib['href'] for a in response.css('a') if 'contact' in a.attrib['href'] or 'about' in a.attrib['href']]
        contact_list.append(response.request.url)
        for fllink in contact_list:
            yield response.follow(fllink, self.extemail)

    def extemail(self, response):
        emails = response.css('body').re(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+')
        yield {
            'emails': emails
        }

process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

f = open("/Users/kalpesh/work/data/test.csv")
for rec in f:
    process.crawl(EmailSpider, record=rec)
f.close()
process.start()
If I understand your intent correctly, you could try the following approach:
a) collect the email IDs in self.emails, like
def extemail(self, response):
    emails = response.css('body').re(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+')
    self.emails = emails.copy()
    yield {
        'emails': emails
    }
(or in whatever other way you extract the email IDs from emails)
b) add a close(self, reason) method, as in the GitHub example, which is called when the spider has finished:
def close(self, reason):
    mails_for_record = ""
    for mail in self.emails:
        mails_for_record += mail + "|"
    print(self.record + mails_for_record)
Please also note that I read somewhere that for some versions of Scrapy it is def close(self, reason), while for others it is def closed(self, reason).
Hope this approach helps you.
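Putting (a) and (b) together, a minimal sketch of the relevant spider parts might look like this (the __init__ and parse methods from the question are assumed unchanged; extending self.emails rather than overwriting it keeps the results from all visited pages, and the method name closed assumes a recent Scrapy version):
import scrapy

class EmailSpider(scrapy.Spider):
    name = "email"  # hypothetical spider name, not part of the original code

    def extemail(self, response):
        emails = response.css('body').re(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+')
        self.emails.extend(emails)  # accumulate results across contact/about pages
        yield {'emails': emails}

    def closed(self, reason):
        # called once when the spider finishes crawling
        print(self.record + "|" + "|".join(self.emails))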
You should visit all of the site's pages before yielding the result for that site.
This means that you need a queue of pages to visit and storage for the results.
It can be done using meta.
Some pseudocode:
def parse(self, response):
    meta = response.meta
    if not meta.get('seen'):
        # -- finding urls of contact and about us pages --
        # -- putting it to meta['queue'] --
        # -- setting meta['seen'] = True
    page_emails_found = ...getting emails here...
    # --- extending already discovered emails
    # --- from other pages/initial empty list with new ones
    meta['emails'].extend(page_emails_found)
    # if queue isn't empty - yielding new request
    if meta['queue']:
        next_url = meta['queue'].pop()
        yield Request(next_url, callback=self.parse, meta=copy(meta))
    # if queue is empty - yielding result from meta
    else:
        yield {'url': current_domain, 'emails': meta['emails']}
Something like this..
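A more concrete sketch of this idea (the spider name, the link-filtering condition and the email regex are assumptions carried over from the question, not a tested implementation):
from copy import copy

import scrapy
from scrapy import Request

class EmailMetaSpider(scrapy.Spider):
    name = "email_meta"  # hypothetical name for this sketch
    start_urls = ["https://www.wockenfusscandies.com/"]

    def parse(self, response):
        meta = response.meta
        if not meta.get('seen'):
            # first visit: collect contact/about links into a queue
            links = [a.attrib['href'] for a in response.css('a')
                     if 'contact' in a.attrib.get('href', '') or 'about' in a.attrib.get('href', '')]
            meta['queue'] = [response.urljoin(link) for link in links]
            meta['emails'] = []
            meta['seen'] = True

        # collect the emails found on the current page
        found = response.css('body').re(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+')
        meta['emails'].extend(found)

        if meta['queue']:
            # pages left to visit: pass the queue and partial results along
            next_url = meta['queue'].pop()
            yield Request(next_url, callback=self.parse, meta=copy(meta), dont_filter=True)
        else:
            # queue exhausted: yield one result for the whole site
            yield {'url': response.url, 'emails': meta['emails']}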

Replace occurrences in HTML files

I have to replace some kinds of occurrences in thousands of HTML files and I intend to use a Linux script for this.
Here are some examples of the replacements I have to do.
From: <a class="wiki_link" href="/WebSphere+Application+Server">
To: <a class="wiki_link" href="/confluence/display/WIKIHAB1/WebSphere%20Application%20Server">
That means, add /confluence/display/WIKIHAB1 as a prefix and replace "+" with "%20".
I'll do the same for other tags, like img, iframe, and so on...
First, which tool should I use to do this? Sed? Awk? Something else?
If anybody has an example, I'd really appreciate it.
After some research I found Beautiful Soup. It's a Python library for parsing HTML files, really easy to use and very well documented.
I had no prior experience with Python and could still write the code without problems.
Here is an example of Python code that performs the replacement I mentioned in the question.
#!/usr/bin/python
import os
from bs4 import BeautifulSoup

# Replaces plus sign (+) by %20 and adds the /confluence... prefix to each
# href parameter of anchor (a) tags that have wiki_link in the class parameter
def fixAnchorTags(soup):
    tags = soup.find_all('a')
    for tag in tags:
        newhref = tag.get("href")
        if newhref is not None:
            if tag.get("class") is not None and "wiki_link" in tag.get("class"):
                newhref = newhref.replace("+", "%20")
                newhref = "/confluence/display/WIKIHAB1" + newhref
                tag['href'] = newhref

# Creates a folder to save the converted files
def setup():
    if not os.path.exists("converted"):
        os.makedirs("converted")

# Runs all methods for each html file in the current folder
def run():
    for file in os.listdir("."):
        if file.endswith(".html"):
            print "Converting " + file
            htmlfile = open(file, "r")
            converted = open("converted/"+file, "w")
            soup = BeautifulSoup(htmlfile, "html.parser")
            fixAnchorTags(soup)
            converted.write(soup.prettify("UTF-8"))
            converted.close()
            htmlfile.close()

setup()
run()
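The question also mentions doing the same for img and iframe tags. A hedged sketch of how the same approach could be extended (the attribute name and the prefix are assumptions based on the anchor example above, not tested against the actual files):
# Applies the same prefix and "+" -> "%20" fix to the src attribute of
# img and iframe tags, following the pattern of fixAnchorTags above.
def fixSrcTags(soup):
    for tag in soup.find_all(['img', 'iframe']):
        newsrc = tag.get("src")
        if newsrc is not None:
            newsrc = newsrc.replace("+", "%20")
            tag['src'] = "/confluence/display/WIKIHAB1" + newsrc

It would be called from run() right after fixAnchorTags(soup).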

Relative URL to absolute URL Scrapy

I need help converting relative URLs to absolute URLs in a Scrapy spider.
I need to convert the links on my start pages to absolute URLs to get the images of the crawled items, which are on the start pages. I unsuccessfully tried different ways to achieve this and I'm stuck. Any suggestions?
class ExampleSpider(scrapy.Spider):
    name = "example"
    allowed_domains = ["example.com"]
    start_urls = [
        "http://www.example.com/billboard",
        "http://www.example.com/billboard?page=1"
    ]

    def parse(self, response):
        image_urls = response.xpath('//div[@class="content"]/section[2]/div[2]/div/div/div/a/article/img/@src').extract()
        relative_url = response.xpath(u'''//div[contains(concat(" ", normalize-space(@class), " "), " content ")]/a/@href''').extract()
        for image_url, url in zip(image_urls, absolute_urls):
            item = ExampleItem()
            item['image_urls'] = image_urls
            request = Request(url, callback=self.parse_dir_contents)
            request.meta['item'] = item
            yield request
There are mainly three ways to achieve that:
Using the urljoin function from urllib:
from urllib.parse import urljoin
# Same as: from w3lib.url import urljoin
url = urljoin(base_url, relative_url)
Using the response's urljoin wrapper method, as mentioned by Steve.
url = response.urljoin(relative_url)
If you also want to yield a request from that link, you can use the handy response.follow method:
# It will create a new request using the above "urljoin" method
yield response.follow(relative_url, callback=self.parse)
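Applied to the parse method from the question, a minimal sketch could look like this (the XPath expressions are taken from the question as-is; assigning one image per item is an assumption):
def parse(self, response):
    image_urls = response.xpath('//div[@class="content"]/section[2]/div[2]/div/div/div/a/article/img/@src').extract()
    relative_urls = response.xpath('//div[contains(concat(" ", normalize-space(@class), " "), " content ")]/a/@href').extract()
    # build absolute URLs from the relative ones before zipping
    absolute_urls = [response.urljoin(url) for url in relative_urls]
    for image_url, url in zip(image_urls, absolute_urls):
        item = ExampleItem()
        item['image_urls'] = [response.urljoin(image_url)]  # image src may be relative too
        request = Request(url, callback=self.parse_dir_contents)
        request.meta['item'] = item
        yield request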

scrapy handle hebrew (non-english) language

I am using Scrapy to scrape a Hebrew website. However, even after encoding the scraped data into UTF-8, I am not able to get the Hebrew characters.
I get a weird string (× ×¨×¡×™ בעמ) in the CSV. However, if I print the same item, I can see the correct string on the terminal.
Following is the website I am using.
http://www.moch.gov.il/rasham_hakablanim/Pages/pinkas_hakablanim.aspx
class Spider(BaseSpider):
    name = "moch"
    allowed_domains = ["www.moch.gov.il"]
    start_urls = ["http://www.moch.gov.il/rasham_hakablanim/Pages/pinkas_hakablanim.aspx"]

    def parse(self, response):
        data = {'ctl00$ctl13$g_dbcc924d_5066_4fee_bc5c_6671d3e2c06d$ctl00$cboAnaf': unicode(140),
                'SearchFreeText:': u'חפש',
                'ctl00$ctl13$g_dbcc924d_5066_4fee_bc5c_6671d3e2c06d$ctl00$txtShemKablan': u'',
                'ctl00$ctl13$g_dbcc924d_5066_4fee_bc5c_6671d3e2c06d$ctl00$txtMisparYeshut': u'',
                'ctl00$ctl13$g_dbcc924d_5066_4fee_bc5c_6671d3e2c06d$ctl00$txtShemYeshuv': u'הקלד יישוב',
                'ctl00$ctl13$g_dbcc924d_5066_4fee_bc5c_6671d3e2c06d$ctl00$txtMisparKablan': u'',
                'ctl00$ctl13$g_dbcc924d_5066_4fee_bc5c_6671d3e2c06d$ctl00$btnSearch': u'חפש',
                'ctl00$ScriptManager1': u'ctl00$ctl13$g_dbcc924d_5066_4fee_bc5c_6671d3e2c06d$ctl00$UpdatePanel1|ctl00$ctl13$g_dbcc924d_5066_4fee_bc5c_6671d3e2c06d$ctl00$btnSearch'}
        yield FormRequest.from_response(response,
                                        formdata=data,
                                        callback=self.fetch_details,
                                        dont_click=True)

    def fetch_details(self, response):
        # print response.body
        hxs = HtmlXPathSelector(response)
        item = MochItem()
        names = hxs.select("//table[@id='ctl00_ctl13_g_dbcc924d_5066_4fee_bc5c_6671d3e2c06d_ctl00_gridRashamDetails']//tr/td[2]/font/text()").extract()
        phones = hxs.select("//table[@id='ctl00_ctl13_g_dbcc924d_5066_4fee_bc5c_6671d3e2c06d_ctl00_gridRashamDetails']//tr/td[6]/font/text()").extract()
        index = 0
        for name in names:
            item['name'] = name.encode('utf-8')
            item['phone'] = phones[index].encode('utf-8')
            index += 1
            print item  # This is printed correctly on the terminal.
            yield item  # If I create a CSV output file, I am not able to see the proper Hebrew string.
The weird thing is, if I open the same CSV in Notepad++, I can see the correct output. So as a workaround, I opened the CSV in Notepad++, changed the encoding to UTF-8 and saved it. Now when I open the CSV in Excel again, it shows me the correct Hebrew string.
Is there any way to specify the CSV encoding from within Scrapy?
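In more recent Scrapy versions the feed export encoding can be configured in settings.py; a hedged sketch, assuming a Scrapy version that supports the FEED_EXPORT_ENCODING setting (the utf-8-sig codec writes a BOM, which is what lets Excel detect UTF-8, effectively mirroring the Notepad++ workaround):
# settings.py
# 'utf-8-sig' prepends a UTF-8 BOM to the exported CSV so Excel detects the encoding.
FEED_EXPORT_ENCODING = 'utf-8-sig'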

Create cased redirections for lowercase files in Amazon S3

Here is the situation:
I have a static website hosted on Amazon S3. All files in it are lowercase, for example: file.html
I am looking for a script/program/tool to find all lowercase files in an S3 site and create several cased 301 redirections.
E.g. create the two files File.html and FILE.html and use the new 301 redirect feature to redirect requests with capital letters to the real, lowercase file.
Please advise.
I've hacked together a script which does what you want. It's not well-rounded by any means but should do the trick. I've put it up on GitHub at https://github.com/mikewirth/s3-caseredirect.
Usage:
python makeredirects.py access_code secret bucketname key_for_your_file
I've tried a version which uses the Redirection Rules feature, but that didn't work because there is a limit of around 20 rules. This script will therefore create LOTS of empty keys.
For completeness, and because it's so small, here's the script:
#!/usr/bin/env python
"""
This script takes a file on S3 and creates a redirect from every possible
permutation of case to the original file.
Author: Michael Wirth (https://github.com/mikewirth/s3-caseredirect/)
"""
import sys
import os.path
import argparse

try:
    import boto.s3.connection
except:
    print "boto library (http://code.google.com/p/boto/) for aws needs to be installed"
    sys.exit(1)

filenames = None

def make_case_insensitive(bucket, access, secret, key):
    """ Get filename permutations """
    global filenames
    filenames = []
    filename = os.path.basename(key)
    path = os.path.dirname(key)
    filename_permutations(filename)

    connection = boto.s3.connection.S3Connection(access, secret, True)
    b = connection.get_bucket(bucket)
    for fname in filenames:
        if fname == filename:
            continue
        k = b.new_key(os.path.join(path, fname))
        k.set_redirect(key)

def filename_permutations(filename, pos=0):
    if len(filename) == pos:
        filenames.append(filename)
    else:
        upper = filename[:pos] + filename[pos:pos+1].upper() + filename[pos+1:]
        lower = filename[:pos] + filename[pos:pos+1].lower() + filename[pos+1:]
        if upper != lower:
            filename_permutations(upper, pos+1)
            filename_permutations(lower, pos+1)
        else:
            filename_permutations(filename, pos+1)

def main():
    """ CLI """
    parser = argparse.ArgumentParser()
    parser.add_argument("access", help="AWS credentials: access code")
    parser.add_argument("secret", help="AWS credentials: secret")
    parser.add_argument("bucket", help="Name of Amazon S3 bucket")
    parser.add_argument("key", help="Name of the key to make available case-insensitively. (Starts with a slash.)")
    args = parser.parse_args()
    make_case_insensitive(args.bucket, args.access, args.secret, args.key)

if __name__ == "__main__":
    main()
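The script handles one key per invocation; to cover every lowercase file in a bucket, as the question asks, something along these lines could wrap it (a sketch assuming the same boto library and credentials; note that the number of redirect keys grows as 2^n with the number of letters in a filename, so this can create a very large number of keys):
# Sketch: run make_case_insensitive for every key in the bucket whose name
# is already entirely lowercase (i.e. the real files, not the redirects).
import boto.s3.connection

def redirect_whole_bucket(bucket_name, access, secret):
    connection = boto.s3.connection.S3Connection(access, secret, True)
    b = connection.get_bucket(bucket_name)
    for key in b.list():
        if key.name == key.name.lower():
            # the original script expects the key path; adjust the leading
            # slash convention to whatever your site uses
            make_case_insensitive(bucket_name, access, secret, key.name)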