Scrapy best practice - scrapy

I'm using scrapy to download large amount of data. I use default 16 concurrent requests.
As a guide shows, I use pipelines method process_item to collect data at share variable. And at close_spider save data to SQL.
If I load too large website, I lose all system memory.
How should I avoid that problem?
Now I use one DB connection, that prepared at open_spider method and I could not use it in every process_item simultaneously.

Create a list of scraped items in your pipelines, and once that list's size is greater than N, then call the DB function to save data. Here is 100% working code from my project. See close_spider(), at the time of spider closed, there is a chance the self.items had less than N items in it, so any remaining data inside self.items list will also be saved in DB when spiders gets closed.
from scrapy import signals
class YourPipeline(object):
def __init__(self):
self.items = []
def process_item(self, item, spider):
self.items.extend([ item ])
if len(self.items) >= 50:
self.insert_current_items(spider)
return item
def insert_current_items(self, spider):
for item in self.items:
update_query = ', '.join(["`" + key + "` = %s " for key, value in item.iteritems()])
query = "SELECT asin FROM " + spider.tbl_name + " WHERE asin = %s LIMIT 1"
spider.cursor.execute(query, (item['asin']))
existing = spider.cursor.fetchone()
if spider.cursor.rowcount > 0:
query = "UPDATE " + spider.tbl_name + " SET " + update_query + ", date_update = CURRENT_TIMESTAMP WHERE asin = %s"
update_query_vals = list(item.values())
update_query_vals.extend([existing['YOUR_UNIQUE_COLUMN']])
try:
spider.cursor.execute(query, update_query_vals)
except Exception as e:
if 'MySQL server has gone away' in str(e):
spider.connectDB()
spider.cursor.execute(query, update_query_vals)
else:
raise e
else:
# This ELSE is likely never to get executed because we are not scraping ASINS from Amazon website, we just import ASINs into DB from another script
try:
placeholders = ', '.join(['%s'] * len(item))
columns = ', '.join(item.keys())
query = "INSERT INTO %s ( %s ) VALUES ( %s )" % (spider.tbl_name, columns, placeholders)
spider.cursor.execute(query, item)
except Exception as e:
if 'MySQL server has gone away' in str(e):
spider.connectDB()
spider.cursor.execute(query, item)
else:
raise e
self.items = []
def close_spider(self, spider):
self.insert_current_items(spider)

Related

Webcrawler - Scrapy Python

I need help with my webcrawler.
I got an invalid syntax here:
"f.write("{},{},{}\n".format(word,url,count))"
and also when I command "scrapy crawl FirstSpider > wordlist.csv" a csv file shows up but either is empty or not as structured as I want it to be.
I want to crawl 300 websites and need the data as structured as possible.
How can I get a csv file with the urls structured and then the count of the certain keywords next to it,
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.item import Item
import requests
def find_all_substrings(string, sub):
import re
starts = [match.start() for match in re.finditer(re.escape(sub), string)]
return starts
class FirstSpider(CrawlSpider):
name = "FirstSpider"
allowed_domains = ["www.example.com"]
start_urls = ["https://www.example.com/"]
rules = [Rule(LinkExtractor(), follow=True, callback="check_buzzwords")]
crawl_count = 0
words_found = 0
def check_buzzwords(self, response):
self.__class__.crawl_count += 1
wordlist = [
"keyword1",
"keyword2",
"keyword3"
]
url = response.url
data = response.body.decode('utf-8')
count = 0
for word in wordlist:
substrings = find_all_substrings(data, word)
count = 0
word_counts = {}
links = []
"f = open('wordlist.csv', 'w')"
for pos in substrings:
ok = False
if not ok:
count += 1
word_counts[word] = {url: count}
for link in links:
page = requests.get(link)
data = page.text
for word in wordlist:
substrings = find_all_substrings(data, word)
count = 0
for word in wordlist:
substrings = find_all_substrings(data, word)
for pos in substrings:
ok = False
if not ok:
"f.write("{},{},{}\n".format(word,url,count))"
self.__class__.words_found += 1
print(word + ";" + url + ";" + str(count) + ";")
with open('wordlist.csv', 'w') as f:
for word, data in word_counts.items():
for url, count in data.items():
f.write("{},{},{}\n".format(word, url, count))
f.close()
return Item()
def _requests_to_follow(self, response):
if getattr(response, "encoding", None) != None:
return CrawlSpider._requests_to_follow(self, response)
else:
return []
I want to crawl websites for certain keywords (wordlist). My output should be a csv file with the following information: url, count of keyword found on the website.
I got an invalid syntax for the following ``` "f.write("{},{},{}\n".format(word,url,count))"
And the output csv file is often empty or does not crawl all the urls.
You have unnecessary quotation marks around lines 41 and 61
line 41 ---> "f = open('wordlist.csv', 'w')"
line 61 ---> "f.write("{},{},{}\n".format(word,url,count))"
Also usually you don't need to manually save data to a file because Scrapy has a built-in mechanism - Feed export
By using FEED_EXPORT_FIELDS setting you can specify which fields of the item should be exported and their order.
Here is the command to run the spider and save data to a file:
scrapy crawl FirstSpider -O url.csv
-O (capital 'O') means "rewrite a file"
-o (lowercase 'o') means "append to an existent file".

Unable to iterate a list to a thread

I am trying to pass a json return between functions but I get errors. So I convert the json to a list. However, I cannot iterate the list from a while loop unless I specify an actual number.
Full code is
class BackendThread(QThread):
update_date = pyqtSignal(list)
def run(self):
device_mode = []
while True:
#do stuff and get json_return
for xx in json_return["result"]["devices"]:
for y in xx["nodes"]:
if y['type'] == "FRAME_BUFFER":
data = xx["device_id"] + "\n" + y['configuration']['display_mode']
device_mode.append(data)
self.update_date.emit(str(device_mode))
device_mode = []
time.sleep(1)
class Window(QDialog):
def __init__(self):
QDialog.__init__(self)
self.resize(400,400)
self.input=QTextEdit(self)
self.input.resize(400,400)
self.initUI()
def initUI(self):
self.backend=BackendThread()
self.backend.update_date.connect(self.handleDisplay)
self.backend.start()
def handleDisplay(self,data):
count = 0
while count < 11:
self.input.setText(data[count])
count += 1
if __name__ == '__main__':
app=QApplication(sys.argv)
win =Window()
win.show()
sys.exit(app.exec_())
So this part does not work. I only get the last item in the list
count = 0
while count < 11:
self.input.setText(data[count])
count += 1
When I do this, it works but I cannot hard code the item number becuase the list will never have the same amount of items
self.input.setText(data[0])
self.input.setText(data[1])
self.input.setText(data[2])
etc
Any ideas as to how to get that while loop working?

Why is scrapy suddenly giving me an *unpredictable* AttributeError, stating no attribute 'css'

For my job, I built a scrapy spider to quickly check in on ~200-500 website landing pages for clues that the pages are not functioning, outside of just 400-style errors. (e.g. check for the presence of "out of stock" on page.) This check happens across approx. 30 different websites under my purview, all of them using the same page structure.
This has worked fine, every day, for 4 months.
Then, suddenly, and without change to the code, I started getting unpredictable errors, about 4 weeks ago:
url_title = response.css("title::text").extract_first()
AttributeError: 'Response' object has no attribute 'css'
If I run this spider, this error will occur with, say... 3 out of 400 pages.
Then, if immediately run the spider again, those same 3 pages are scraped just fine without error, and 4 totally different pages will return the same error.
Furthermore, if I run the EXACT same spider as below, but replace mapping with just these 7 erroneous landing pages, they are scraped perfectly fine.
Is there something in my code that's not quite right??
I'm going to attach the whole code - sorry in advance!! - I just fear that something I might deem as superfluous may in fact be the cause. So this is the whole thing, but with sensitive data replaced with ####.
I've checked all of the affected pages, and of course the css is valid, and the title is always present.
I've done sudo apt-get update & sudo apt-get dist-upgrade on the server running scrapy, in hopes that this would help. No luck.
import scrapy
from scrapy import signals
from sqlalchemy.orm import sessionmaker
from datetime import date, datetime, timedelta
from scrapy.http.request import Request
from w3lib.url import safe_download_url
from sqlalchemy import and_, or_, not_
import smtplib
from email.MIMEMultipart import MIMEMultipart
from email.MIMEText import MIMEText
from sqlalchemy.engine import create_engine
engine = create_engine('mysql://######:#######localhost/LandingPages', pool_recycle=3600, echo=False)
#conn = engine.connect()
from LandingPageVerifier.models import LandingPagesFacebook, LandingPagesGoogle, LandingPagesSimplifi, LandingPagesScrapeLog, LandingPagesScrapeResults
Session = sessionmaker(bind=engine)
session = Session()
# today = datetime.now().strftime("%Y-%m-%d")
# thisyear = datetime.now().strftime("%Y")
# thismonth = datetime.now().strftime("%m")
# thisday = datetime.now().strftime("%d")
# start = date(year=2019,month=04,day=09)
todays_datetime = datetime(datetime.today().year, datetime.today().month, datetime.today().day)
print todays_datetime
landingpages_today_fb = session.query(LandingPagesFacebook).filter(LandingPagesFacebook.created_on >= todays_datetime).all()
landingpages_today_google = session.query(LandingPagesGoogle).filter(LandingPagesGoogle.created_on >= todays_datetime).all()
landingpages_today_simplifi = session.query(LandingPagesSimplifi).filter(LandingPagesSimplifi.created_on >= todays_datetime).all()
session.close()
#Mix 'em together!
landingpages_today = landingpages_today_fb + landingpages_today_google + landingpages_today_simplifi
#landingpages_today = landingpages_today_fb
#Do some iterating and formatting work
landingpages_today = [(u.ad_url_full, u.client_id) for u in landingpages_today]
#print landingpages_today
landingpages_today = list(set(landingpages_today))
#print 'Unique pages: '
#print landingpages_today
# unique_landingpages = [(u[0]) for u in landingpages_today]
# unique_landingpage_client = [(u[1]) for u in landingpages_today]
# print 'Pages----->', len(unique_landingpages)
class LandingPage004Spider(scrapy.Spider):
name='LandingPage004Spider'
#classmethod
def from_crawler(cls, crawler, *args, **kwargs):
spider = super(LandingPage004Spider, cls).from_crawler(crawler, *args, **kwargs)
#crawler.signals.connect(spider.spider_opened, signals.spider_opened)
crawler.signals.connect(spider.spider_closed, signals.spider_closed)
return spider
def spider_closed(self, spider):
#stats = spider.crawler.stats.get_stats()
stats = spider.crawler.stats.get_value('item_scraped_count'),
Session = sessionmaker(bind=engine)
session = Session()
logitem = LandingPagesScrapeLog(scrape_count = spider.crawler.stats.get_value('item_scraped_count'),
is200 = spider.crawler.stats.get_value('downloader/response_status_count/200'),
is400 = spider.crawler.stats.get_value('downloader/response_status_count/400'),
is403 = spider.crawler.stats.get_value('downloader/response_status_count/403'),
is404 = spider.crawler.stats.get_value('downloader/response_status_count/404'),
is500 = spider.crawler.stats.get_value('downloader/response_status_count/500'),
scrapy_errors = spider.crawler.stats.get_value('log_count/ERROR'),
scrapy_criticals = spider.crawler.stats.get_value('log_count/CRITICAL'),
)
session.add(logitem)
session.commit()
session.close()
#mapping = landingpages_today
handle_httpstatus_list = [200, 302, 404, 400, 500]
start_urls = []
def start_requests(self):
for url, client_id in self.mapping:
yield Request(url, callback=self.parse, meta={'client_id': client_id})
def parse(self, response):
##DEBUG - return all scraped data
#wholepage = response.body.lower()
url = response.url
if 'redirect_urls' in response.request.meta:
redirecturl = response.request.meta['redirect_urls'][0]
if 'utm.pag.ca' in redirecturl:
url_shortener = response.request.meta['redirect_urls'][0]
else:
url_shortener = 'None'
else:
url_shortener = 'None'
client_id = response.meta['client_id']
url_title = response.css("title::text").extract_first()
# pagesize = len(response.xpath('//*[not(descendant-or-self::script)]'))
pagesize = len(response.body)
HTTP_code = response.status
####ERROR CHECK: Small page size
if 'instapage' in response.body.lower():
if pagesize <= 20000:
err_small = 1
else:
err_small = 0
else:
if pagesize <= 35000:
err_small = 1
else:
err_small = 0
####ERROR CHECK: Page contains the phrase 'not found'
if 'not found' in response.xpath('//*[not(descendant-or-self::script)]').extract_first().lower():
#their sites are full of HTML errors, making scrapy unable to notice what is and is not inside a script element
if 'dealerinspire' in response.body.lower():
err_has_not_found = 0
else:
err_has_not_found = 1
else:
err_has_not_found = 0
####ERROR CHECK: Page cotains the phrase 'can't be found'
if "can't be found" in response.xpath('//*[not(self::script)]').extract_first().lower():
err_has_cantbefound = 1
else:
err_has_cantbefound = 0
####ERROR CHECK: Page contains the phrase 'unable to locate'
if 'unable to locate' in response.body.lower():
err_has_unabletolocate = 1
else:
err_has_unabletolocate = 0
####ERROR CHECK: Page contains phrase 'no longer available'
if 'no longer available' in response.body.lower():
err_has_nolongeravailable = 1
else:
err_has_nolongeravailable = 0
####ERROR CHECK: Page contains phrase 'no service specials'
if 'no service specials' in response.body.lower():
err_has_noservicespecials = 1
else:
err_has_noservicespecials = 0
####ERROR CHECK: Page contains phrase 'Sorry, no' to match zero inventory for a search, which normally says "Sorry, no items matching your request were found."
if 'sorry, no ' in response.body.lower():
err_has_sorryno = 1
else:
err_has_sorryno = 0
yield {'client_id': client_id, 'url': url, 'url_shortener': url_shortener, 'url_title': url_title, "pagesize": pagesize, "HTTP_code": HTTP_code, "err_small": err_small, 'err_has_not_found': err_has_not_found, 'err_has_cantbefound': err_has_cantbefound, 'err_has_unabletolocate': err_has_unabletolocate, 'err_has_nolongeravailable': err_has_nolongeravailable, 'err_has_noservicespecials': err_has_noservicespecials, 'err_has_sorryno': err_has_sorryno}
#E-mail settings
def sendmail(recipients,subject,body):
fromaddr = "#######"
toaddr = recipients
msg = MIMEMultipart()
msg['From'] = fromaddr
msg['Subject'] = subject
body = body
msg.attach(MIMEText(body, 'html'))
server = smtplib.SMTP('########)
server.starttls()
server.login(fromaddr, "##########")
text = msg.as_string()
server.sendmail(fromaddr, recipients, text)
server.quit()
`
Expected results is a perfect scrape, with no errors.
Actual results are unpredicatable AttributeErrors, claiming that attribute 'css' can't be found on some pages. But if I scrape those pages individually, using the same script, they scrape just fine.
Sometimes Scrapy can't parse HTML because of markup errors, that's why you can't call response.css(). You can catch these events in your code and analyze broken HTML:
def parse(self, response):
try:
....
your code
.....
except:
with open("Error.htm", "w") as f:
f.write(response.body)
UPDATE You can try to check for empty response:
def parse(self, response):
if not response.body:
yield scrapy.Request(url=response.url, callback=self.parse, meta={'client_id': response.meta["client_id"]})
# your original code

How to format input data for textsum data_convert_example

I was hoping someone may be able to see where I am failing here. So I have scraped some data from buzzfeed and now I am trying to format a text file with which I can then send into data_convert_examples text_to_data formatter.
I thought I had the answer a couple times, but I am still running up against a brick wall when I process this as binary and then try to train against the data.
What I did was run the binary_to_text on the toy dataset and then opened the file in notepad++ under windows, showing all characters, and matched what I believed to be the format.
I appologize for the long function below, but I really am unsure as to where the issue might be and figured this was the best way to provide enough info. Anyone have any ideas or recommendations?
def processPath(self, toPath):
try:
fout = open(os.path.join(toPath, '{}-{}'.format(self.baseName, self.fileNdx)), 'a+')
for path, dirs, files in os.walk(self.fromPath):
for fn in files:
fullpath = os.path.join(path, fn)
if os.path.isfile(fullpath):
#with open(fullpath, "rb") as f:
with codecs.open(fullpath, "rb", 'ascii', "ignore") as f:
try:
finalRes = ""
content = f.readlines()
self.populateVocab(content)
sentences = sent_tokenize((content[1]).encode('ascii', "ignore").strip('\n'))
for sent in sentences:
textSumFmt = self.textsumFmt
finalRes = textSumFmt["artPref"] + textSumFmt["sentPref"] + sent.replace("=", "equals") + textSumFmt["sentPost"] + textSumFmt["postVal"]
finalRes += (('\t' + textSumFmt["absPref"] + textSumFmt["sentPref"] + (content[0]).strip('\n').replace("=", "equals") + textSumFmt["sentPost"] + textSumFmt["postVal"]) + '\t' +'publisher=BUZZ' + os.linesep)
if self.lineNdx != 0 and self.lineNdx % self.lines == 0:
fout.close()
self.fileNdx+=1
fout = open(os.path.join(toPath, '{}-{}'.format(self.baseName, self.fileNdx)), 'a+')
fout.write( ("{}").format( finalRes.encode('utf-8', "ignore") ) )
self.lineNdx+=1
except RuntimeError as e:
print "Runtime Error: {0} : {1}".format(e.errno, e.strerror)
finally:
fout.close()
After further analysis, it seems that the source of the problem is more with the source data and the way it is constructed rather than data_convert_example.py itself. I'm closing this as the heading is not in-line with the source of the issue.
I found the source of my problem was that I had a space between "Article" and the equals sign. After removing that I was able to successfully train.

I just want to load 5GB from MySql into BigQuery

Long time no see. I'd want to get 5GB of data from MySql into BigQuery. My best bet seems to be some sort of CSV export / import. Which doesn't work for various reasons, see:
agile-coral-830:splitpapers1501200518aa150120052659
agile-coral-830:splitpapers1501200545aa150120055302
agile-coral-830:splitpapers1501200556aa150120060231
This is likely because I don't have the right MySql incantation able to generate perfect CSV in accordance with RFC 4180. However, instead of arguing RFC 4180 minutia, this whole load business could be solved in five minutes by supporting customizable multi-character field separators and multi-character line separators. I'm pretty sure my data doesn't contain either ### nor ###, so the following would work like a charm:
mysql> select * from $TABLE_NAME
into outfile '$DATA.csv'
fields terminated by '###'
enclosed by ''
lines terminated by '###'
$ bq load --nosync -F '###' -E '###' $TABLE_NAME $DATA.csv $SCHEMA.json
Edit: Fields contain '\n', '\r', ',' and '"'. They also contain NULLs, which MySql represents as [escape]N, in the example "N. Sample row:
"10.1.1.1.1483","5","9074080","Candidate high myopia loci on chromosomes 18p and 12q do not play a major role in susceptibility to common myopia","Results
There was no strong evidence of linkage of common myopia to these candidate regions: all two-point and multipoint heterogeneity LOD scores were < 1.0 and non-parametric linkage p-values were > 0.01. However, one Amish family showed slight evidence of linkage (LOD>1.0) on 12q; another 3 Amish families each gave LOD >1.0 on 18p; and 3 Jewish families each gave LOD >1.0 on 12q.
Conclusions
Significant evidence of linkage (LOD> 3) of myopia was not found on chromosome 18p or 12q loci in these families. These results suggest that these loci do not play a major role in the causation of common myopia in our families studied.","2004","BMC MEDICAL GENETICS","JOURNAL","N,"5","20","","","","0","1","USER","2007-11-19 05:00:00","rep1","PDFLib TET","0","2009-05-24 20:33:12"
I found loading through a CSV very difficult. More restrictions and complications. I have been messing around this morning with moving data from MySQL to BigQuery.
Bellow is a Python script that will build the table decorator and stream the data directly into the BigQuery table.
My db is in the Cloud so you may need to change the connection string. Fill in the missing values for your particular situation then call it by:
SQLToBQBatch(tableName, limit)
I put the limit in to test with. For my final test I sent 999999999 for the limit and everything worked fine.
I would recommend using a backend module to run this over 5g.
Use "RowToJSON" to clean up and invalid characters (ie anything non utf8).
I haven't tested on 5gb but it was able to do 50k rows in about 20 seconds. The same load in CSV was over 2 minutes.
I wrote this to test things, so please excuse the bad codding practices and mini hacks. It works so feel free to clean it up for any production level work.
import MySQLdb
import logging
from apiclient.discovery import build
from oauth2client.appengine import AppAssertionCredentials
import httplib2
OAUTH_SCOPE = 'https://www.googleapis.com/auth/bigquery'
PROJECT_ID =
DATASET_ID =
TABLE_ID =
SQL_DATABASE_NAME =
SQL_DATABASE_DB =
SQL_USER =
SQL_PASS =
def Connect():
return MySQLdb.connect(unix_socket='/cloudsql/' + SQL_DATABASE_NAME, db=SQL_DATABASE_DB, user=SQL_USER, passwd=SQL_PASS)
def RowToJSON(cursor, row, fields):
newData = {}
for i, value in enumerate(row):
try:
if fields[i]["type"] == bqTypeDict["int"]:
value = int(value)
else:
value = float(value)
except:
if value is not None:
value = value.replace("\x92", "'") \
.replace("\x96", "'") \
.replace("\x93", '"') \
.replace("\x94", '"') \
.replace("\x97", '-') \
.replace("\xe9", 'e') \
.replace("\x91", "'") \
.replace("\x85", "...") \
.replace("\xb4", "'") \
.replace('"', '""')
newData[cursor.description[i][0]] = value
return newData
def GetBuilder():
return build('bigquery', 'v2',http = AppAssertionCredentials(scope=OAUTH_SCOPE).authorize(httplib2.Http()))
bqTypeDict = { 'int' : 'INTEGER',
'varchar' : 'STRING',
'double' : 'FLOAT',
'tinyint' : 'INTEGER',
'decimal' : 'FLOAT',
'text' : 'STRING',
'smallint' : 'INTEGER',
'char' : 'STRING',
'bigint' : 'INTEGER',
'float' : 'FLOAT',
'longtext' : 'STRING'
}
def BuildFeilds(table):
conn = Connect()
cursor = conn.cursor()
cursor.execute("DESCRIBE %s;" % table)
tableDecorator = cursor.fetchall()
fields = []
for col in tableDecorator:
field = {}
field["name"] = col[0]
colType = col[1].split("(")[0]
if colType not in bqTypeDict:
logging.warning("Unknown type detected, using string: %s", str(col[1]))
field["type"] = bqTypeDict.get(colType, "STRING")
if col[2] == "YES":
field["mode"] = "NULLABLE"
fields.append(field)
return fields
def SQLToBQBatch(table, limit=3000):
logging.info("****************************************************")
logging.info("Starting SQLToBQBatch. Got: Table: %s, Limit: %i" % (table, limit))
bqDest = GetBuilder()
fields = BuildFeilds(table)
try:
responce = bqDest.datasets().insert(projectId=PROJECT_ID, body={'datasetReference' :
{'datasetId' : DATASET_ID} }).execute()
logging.info("Added Dataset")
logging.info(responce)
except Exception, e:
logging.info(e)
if ("Already Exists: " in str(e)):
logging.info("Dataset already exists")
else:
logging.error("Error creating dataset: " + str(e), "Error")
try:
responce = bqDest.tables().insert(projectId=PROJECT_ID, datasetId=DATASET_ID, body={'tableReference' : {'projectId' : PROJECT_ID,
'datasetId' : DATASET_ID,
'tableId' : TABLE_ID},
'schema' : {'fields' : fields}}
).execute()
logging.info("Added Table")
logging.info(responce)
except Exception, e:
logging.info(e)
if ("Already Exists: " in str(e)):
logging.info("Table already exists")
else:
logging.error("Error creating table: " + str(e), "Error")
conn = Connect()
cursor = conn.cursor()
logging.info("Starting load loop")
count = -1
cur_pos = 0
total = 0
batch_size = 1000
while count != 0 and cur_pos < limit:
count = 0
if batch_size + cur_pos > limit:
batch_size = limit - cur_pos
sqlCommand = "SELECT * FROM %s LIMIT %i, %i" % (table, cur_pos, batch_size)
logging.info("Running: %s", sqlCommand)
cursor.execute(sqlCommand)
data = []
for _, row in enumerate(cursor.fetchall()):
data.append({"json": RowToJSON(cursor, row, fields)})
count += 1
logging.info("Read complete")
if count != 0:
logging.info("Sending request")
insertResponse = bqDest.tabledata().insertAll(
projectId=PROJECT_ID,
datasetId=DATASET_ID,
tableId=TABLE_ID,
body={"rows":data}).execute()
cur_pos += batch_size
total += count
logging.info("Done %i, Total: %i, Response: %s", count, total, insertResponse)
if "insertErrors" in insertResponse:
logging.error("Error inserting data index: %i", insertResponse["insertErrors"]["index"])
for error in insertResponse["insertErrors"]["errors"]:
logging.error(error)
else:
logging.info("No more rows")
• Generate google service account key
o IAM & Admin > Service account > create_Service_account
o Once created then create key , download and save It to the project folder on local machine – google_key.json
• Run the code in pycharm environment after installing the packages.
NOTE : The table data in mysql remains intact. Also , if one uses preview in BQ to see that you won’t see. Go to console and fire the query.
o CODE
o import MySQLdb
from google.cloud import bigquery
import mysql.connector
import logging
import os
from MySQLdb.converters import conversions
import click
import MySQLdb.cursors
from google.cloud.exceptions import ServiceUnavailable
import sys
bqTypeDict = {'int': 'INTEGER',
'varchar': 'STRING',
'double': 'FLOAT',
'tinyint': 'INTEGER',
'decimal': 'FLOAT',
'text': 'STRING',
'smallint': 'INTEGER',
'char': 'STRING',
'bigint': 'INTEGER',
'float': 'FLOAT',
'longtext': 'STRING',
'datetime': 'TIMESTAMP'
}
def conv_date_to_timestamp(str_date):
import time
import datetime
date_time = MySQLdb.times.DateTime_or_None(str_date)
unix_timestamp = (date_time - datetime.datetime(1970, 1, 1)).total_seconds()
return unix_timestamp
def Connect(host, database, user, password):
return mysql.connector.connect(host='',
port='',
database='recommendation_spark',
user='',
password='')
def BuildSchema(host, database, user, password, table):
logging.debug('build schema for table %s in database %s' % (table, database))
conn = Connect(host, database, user, password)
cursor = conn.cursor()
cursor.execute("DESCRIBE %s;" % table)
tableDecorator = cursor.fetchall()
schema = []
for col in tableDecorator:
colType = col[1].split("(")[0]
if colType not in bqTypeDict:
logging.warning("Unknown type detected, using string: %s", str(col[1]))
field_mode = "NULLABLE" if col[2] == "YES" else "REQUIRED"
field = bigquery.SchemaField(col[0], bqTypeDict.get(colType, "STRING"), mode=field_mode)
schema.append(field)
return tuple(schema)
def bq_load(table, data, max_retries=5):
logging.info("Sending request")
uploaded_successfully = False
num_tries = 0
while not uploaded_successfully and num_tries < max_retries:
try:
insertResponse = table.insert_data(data)
for row in insertResponse:
if 'errors' in row:
logging.error('not able to upload data: %s', row['errors'])
uploaded_successfully = True
except ServiceUnavailable as e:
num_tries += 1
logging.error('insert failed with exception trying again retry %d', num_tries)
except Exception as e:
num_tries += 1
logging.error('not able to upload data: %s', str(e))
#click.command()
#click.option('-h', '--host', default='tempus-qa.hashmapinc.com', help='MySQL hostname')
#click.option('-d', '--database', required=True, help='MySQL database')
#click.option('-u', '--user', default='root', help='MySQL user')
#click.option('-p', '--password', default='docker', help='MySQL password')
#click.option('-t', '--table', required=True, help='MySQL table')
#click.option('-i', '--projectid', required=True, help='Google BigQuery Project ID')
#click.option('-n', '--dataset', required=True, help='Google BigQuery Dataset name')
#click.option('-l', '--limit', default=0, help='max num of rows to load')
#click.option('-s', '--batch_size', default=1000, help='max num of rows to load')
#click.option('-k', '--key', default='key.json',help='Location of google service account key (relative to current working dir)')
#click.option('-v', '--verbose', default=0, count=True, help='verbose')
def SQLToBQBatch(host, database, user, password, table, projectid, dataset, limit, batch_size, key, verbose):
# set to max verbose level
verbose = verbose if verbose < 3 else 3
loglevel = logging.ERROR - (10 * verbose)
logging.basicConfig(level=loglevel)
logging.info("Starting SQLToBQBatch. Got: Table: %s, Limit: %i", table, limit)
## set env key to authenticate application
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = os.path.join(os.getcwd(), key)
print('file found')
# Instantiates a client
bigquery_client = bigquery.Client()
print('Project id created')
try:
bq_dataset = bigquery_client.dataset(dataset)
bq_dataset.create()
logging.info("Added Dataset")
except Exception as e:
if ("Already Exists: " in str(e)):
logging.info("Dataset already exists")
else:
logging.error("Error creating dataset: %s Error", str(e))
bq_table = bq_dataset.table(table)
bq_table.schema = BuildSchema(host, database, user, password, table)
print('Creating schema using build schema')
bq_table.create()
logging.info("Added Table %s", table)
conn = Connect(host, database, user, password)
cursor = conn.cursor()
logging.info("Starting load loop")
cursor.execute("SELECT * FROM %s" % (table))
cur_batch = []
count = 0
for row in cursor:
count += 1
if limit != 0 and count >= limit:
logging.info("limit of %d rows reached", limit)
break
cur_batch.append(row)
if count % batch_size == 0 and count != 0:
bq_load(bq_table, cur_batch)
cur_batch = []
logging.info("processed %i rows", count)
# send last elements
bq_load(bq_table, cur_batch)
logging.info("Finished (%i total)", count)
print("table created")
if __name__ == '__main__':
# run the command
SQLToBQBatch()
o Command to run the file : python mysql_to_bq.py -d 'recommendation_spark' -t temp_market_store -i inductive-cocoa-250507 -n practice123 -k key.json