Scrapy register a custom clean line by line item exporter - scrapy

So, this should have been simpler, and in the end it was quite simple, but the Scrapy documentation does leave some figuring out to do ... So anyway this is a Q + A:
How to write Scrapy items "as is" line by line to a text file?

Basically you need to register an item exporter, and then tell Scrapy that you want to use it from command line:
Create a file named: lines_exporter.py:
from scrapy.exporters import BaseItemExporter
from scrapy.utils.python import to_bytes


class LinesExporter(BaseItemExporter):
    """Item exporter that writes each item as one comma-separated line.

    Register it under FEED_EXPORTERS and select it with ``-t lines``.
    """

    def __init__(self, file, **kwargs):
        # dont_fail=True makes BaseItemExporter tolerate unknown kwargs.
        super().__init__(dont_fail=True, **kwargs)
        self.file = file
        self._kwargs.setdefault('ensure_ascii', not self.encoding)

    def export_item(self, item):
        itemdict = dict(self._get_serialized_fields(item))
        # str() guards against non-string field values (e.g. lists, ints),
        # which would make str.join raise TypeError.
        data = ', '.join(str(value) for value in itemdict.values()) + '\n'
        self.file.write(to_bytes(data, self.encoding))
Add/edit the following in your settings.py:
FEED_EXPORTERS = {
'lines': 'project_name.lines_exporter.LinesExporter',
}
When invoking Scrapy from command line, specify the output-format flag, or:
-t lines
Enjoy!

Related

Empty .json file

I have written this short spider code to extract titles from hacker news front page(http://news.ycombinator.com/).
import scrapy


class HackerItem(scrapy.Item):
    # The only field we scrape: the front-page story titles.
    hackertitle = scrapy.Field()


class HackerSpider(scrapy.Spider):
    name = 'hackernewscrawler'
    allowed_domains = ['news.ycombinator.com']  # website we chose
    start_urls = ['http://news.ycombinator.com/']

    def parse(self, response):
        sel = scrapy.Selector(response)  # selector to help us extract the titles
        item = HackerItem()  # the item declared above
        # XPath of the titles; '@' is the attribute axis ('#' in the posted
        # code is a markup artifact of the site that hosted this snippet).
        item['hackertitle'] = sel.xpath(
            "//tr[@class='athing']/td[3]/a[@href]/text()").extract()
        # NOTE(review): printing instead of yielding means no item ever
        # reaches the feed exporter -- exactly the bug this question is about.
        print(item['hackertitle'])
However when I run the command scrapy crawl hackernewscrawler -o hntitles.json -t json
i get an empty .json file that does not have any content in it.
You should change print statement to yield:
import scrapy


class HackerItem(scrapy.Item):
    # Declares the item's single field.
    hackertitle = scrapy.Field()


class HackerSpider(scrapy.Spider):
    name = 'hackernewscrawler'
    allowed_domains = ['news.ycombinator.com']  # website we chose
    start_urls = ['http://news.ycombinator.com/']

    def parse(self, response):
        sel = scrapy.Selector(response)  # selector to help us extract the titles
        item = HackerItem()  # the item declared above
        # XPath of the titles ('@' attribute axis, garbled to '#' in the post).
        item['hackertitle'] = sel.xpath(
            "//tr[@class='athing']/td[3]/a[@href]/text()").extract()
        # Yield (not print) so Scrapy's feed export pipeline receives the item.
        yield item
Then run:
scrapy crawl hackernewscrawler -o hntitles.json -t json

scrapy export the data to files by urls'path

How can I change Scrapy's source code, in order to save files by the URLs, when I export the data from HTML pages.
For example:
this pages (http://example/big/ppp) have lots of pages links
http://example/big/ppp/a
http://example/big/ppp/b
http://example/big/ppp/c
......
and I want to save the data from
http://example/big/ppp/a in d:/ppp/a.csv
http://example/big/ppp/b in d:/ppp/b.csv
http://example/big/ppp/c in d:/ppp/c.csv
because these pages (http://example/big/ppp) have lots of links like
http://example/big/ppp/a,http://example/big/ppp/b.
So could you help me, kind person!
You can use scrapy pipeline to do this job, add a field to the item you are going to export, for example named 'source' (http://example/big/ppp/a) to record where the item from:
from scrapy import signals
# scrapy.contrib was removed in Scrapy 1.x; exporters now live in
# scrapy.exporters (same module the LinesExporter snippet above uses).
from scrapy.exporters import CsvItemExporter


class MyCsvPipeline(object):
    """Routes each item to a per-source CSV file.

    The item must carry a 'source' field (the URL it was scraped from);
    the last path segment of that URL names the output file.
    """

    def __init__(self):
        self.csvfiles = {}   # csv filename -> open file object
        self.exporter = {}   # csv filename -> CsvItemExporter

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        # Bug fix: the original connected the nonexistent attribute
        # pipeline.spider_closed; the handler is named close_spider.
        crawler.signals.connect(pipeline.close_spider, signals.spider_closed)
        return pipeline

    def close_spider(self, spider):
        # Flush every exporter, then close the underlying files.
        for exporter in self.exporter.values():
            exporter.finish_exporting()
        for csvfile in self.csvfiles.values():
            csvfile.close()

    def process_item(self, item, spider):
        csv = item['source'].split('/')[-1] + '.csv'
        if csv not in self.csvfiles:
            # Lazily open one file + exporter per distinct source.
            newfile = open('d:/ppp/' + csv, 'wb')
            self.csvfiles[csv] = newfile
            self.exporter[csv] = CsvItemExporter(newfile)
            self.exporter[csv].start_exporting()
        self.exporter[csv].export_item(item)
        return item
apply this pipeline in settings.py
ITEM_PIPELINES = {
'xxxx.pipelines.MyCsvPipeline': 300,
}
another option
use scrapy crawl xxx -t csv -o all.csv --loglevel=INFO to export all items to a csv, then use another script to separate it into small csv according to 'source'.

NiFi: Remove fixed number of header lines from file

I'm processing a file and I'd like to remove (trim) the first X header lines to keep only data, possibly avoiding using regular expressions.
Thanks
You can remove the first X header lines by using the ExecuteScript processor in NiFi.
The following is an example Jython script which I wrote for myself:
import json
import java.io
from org.apache.commons.io import IOUtils
from java.nio.charset import StandardCharsets
from org.apache.nifi.processor.io import StreamCallback


class PyStreamCallback(StreamCallback):
    """NiFi stream callback that drops the first 3 lines of the flow file."""

    def __init__(self):
        pass

    def process(self, inputStream, outputStream):
        # readLines strips the line terminators, so '\n' is re-appended
        # on every write.
        text = IOUtils.readLines(inputStream, StandardCharsets.UTF_8)
        for line in text[3:]:  # skip the 3-line header
            outputStream.write(line + "\n")


flowFile = session.get()
if flowFile is not None:
    flowFile = session.write(flowFile, PyStreamCallback())
    # Rename e.g. 'data.json' -> 'data_translated.json'.
    flowFile = session.putAttribute(
        flowFile, "filename",
        flowFile.getAttribute('filename').split('.')[0] + '_translated.json')
    session.transfer(flowFile, REL_SUCCESS)
This obviously removes the first 3 lines but you can easily modify it to remove more or less lines.
Hope that helps.

Replace occurrences on html file

I have to replace some kinds of occurrences in thousands of HTML files and I'm intending to use a Linux script for this.
Here are some examples of replaces I have to do
From: <a class="wiki_link" href="/WebSphere+Application+Server">
To: <a class="wiki_link" href="/confluence/display/WIKIHAB1/WebSphere%20Application%20Server">
That means, add /confluence/display/WIKIHAB1 as prefix and replace "+" by "%20".
I'll do the same for other tags, like img, iframe, and so on...
First, which tool should I use to make it? Sed? Awk? Other?
If anybody has any example, I really appreciate.
After some research I found out about Beautiful Soup. It's a Python library to parse HTML files, really easy to use and very well documented.
I had no experience with Python and could write the code without problems.
Here is an example of python code to make the replace that I mentioned in the question.
#!/usr/bin/python
import os
from bs4 import BeautifulSoup


# Replaces the plus sign (+) by %20 and adds the /confluence... prefix to the
# href attribute of every anchor (a) tag whose class contains wiki_link.
def fixAnchorTags(soup):
    for tag in soup.find_all('a'):
        href = tag.get("href")
        if href is None:
            continue  # anchors without href are left untouched
        classes = tag.get("class")
        if classes is not None and "wiki_link" in classes:
            # '+' in the old wiki URLs stands for a space; Confluence
            # expects the percent-encoded form %20.
            tag['href'] = "/confluence/display/WIKIHAB1" + href.replace("+", "%20")
# Creates a folder to save the converted files.
def setup():
    # Only create when missing; avoids OSError on re-runs
    # (this script predates os.makedirs' exist_ok parameter).
    if not os.path.exists("converted"):
        os.makedirs("converted")
# Runs all methods for each HTML file in the current folder.
def run():
    for file in os.listdir("."):
        if not file.endswith(".html"):
            continue
        print("Converting " + file)
        # Context managers guarantee both handles close even if
        # parsing or writing raises.
        with open(file, "r") as htmlfile:
            soup = BeautifulSoup(htmlfile, "html.parser")
        fixAnchorTags(soup)
        with open("converted/" + file, "w") as converted:
            converted.write(soup.prettify("UTF-8"))


setup()
run()

tkinter variable for drop down selection empty

I tried to program an app in tkinter that would load random lines from a file you select from a pull down menu and display the selected line in a text window.
It seems like the variable "var" in insert_txt does not return the selected "option" but rather an "empty" string, resulting in the following error:
"File not found error" (FileNotFoundError: [Errno2] No such file or
directory: '').
Please help!
#!/usr/bin/env python
# Python 3
import tkinter
from tkinter import ttk
import random
class Application:
    """Picks a file from a drop-down menu and shows a random line from it."""

    def __init__(self, root):
        self.root = root
        self.root.title('Random Stuff')
        ttk.Frame(self.root, width=450, height=185).pack()
        self.init_widgets()
        # Keep the StringVar on self: the OptionMenu writes the selection
        # into this variable, and insert_txt reads it back.  The original
        # created a *new* StringVar inside insert_txt, which is always ''
        # and caused the FileNotFoundError from the question.
        self.var = tkinter.StringVar(self.root)
        choices = ['option1', 'option2', 'option3']
        option = tkinter.OptionMenu(self.root, self.var, *choices)
        option.pack(side='right', padx=10, pady=10)

    def init_widgets(self):
        ttk.Button(self.root, command=self.insert_txt, text='Button',
                   width='10').place(x=10, y=10)
        self.txt = tkinter.Text(self.root, width='45', height='5')
        self.txt.place(x=10, y=50)

    def insert_txt(self):
        # Read the file name currently selected in the drop-down.
        name = self.var.get()
        line = random.choice(open(name).readlines())
        self.txt.insert(tkinter.INSERT, line)
if __name__ == '__main__':
    root = tkinter.Tk()
    Application(root)
    root.mainloop()
That's because you're just creating an empty StringVar that isn't modified later, thus returning an empty string.
The OptionMenu takes the command parameter that calls the specified method every time another option is selected. Now, you can call a method like this, replacing you insert_txt:
def __init__(self):
    # ...
    self.var = tkinter.StringVar()
    # Pass self.var (the posted fragment referenced an undefined 'var' and
    # bare 'root'); command= fires option_selected on every selection.
    self.options = tkinter.OptionMenu(self.root, self.var, *choices,
                                      command=self.option_selected)
    # ...

def option_selected(self, event):
    # The selected option is read from the shared StringVar.
    name = self.var.get()
    # The stuff you already had
Additionally, you have to empty the Text widget, otherwise the previous text would stay. I think the Entry widget is better for that, too.