Error with YouTube v3 API in Python: ResponseNotReady

I use a Python script to search for video information with the YouTube v3 API. On one computer the script works perfectly, but on another it receives the following error:
File "script.py", line 105, in youtube_search(options)
File "script.py", line 16, in youtube_search developerKey = DEVELOPER_KEY
.....
File "C:\Python27\lib\httplib.py", line 1013, in getresponse raise ResponseNotReady()
httplib.ResponseNotReady.
The youtube_search() function that I'm using is:
def youtube_search(options):
    youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION,
                    developerKey=DEVELOPER_KEY)
    search_response = youtube.search().list(
        q=options.q,
        part="id, snippet",
        maxResults=options.maxResults
    ).execute()
    videos = []
    channels = []
    playlists = []
    videoInfo = []
    t = datetime.datetime.now()
    ff_name = '[' + options.q + '].txt'
    f = open(ff_name, 'w')
    no_results = search_response.get("pageInfo")["totalResults"]
    page = 1
    while page <= (no_results / 50):
        nextPage = search_response.get("nextPageToken")
        for search_result in search_response.get("items", []):
            if search_result["id"]["kind"] == "youtube#video":
                info = youtube.videos().list(
                    part="statistics,contentDetails",
                    id=search_result["id"]["videoId"]).execute()
                for info_result in info.get("items", []):
                    videos.append("%s<|>%s<|>%s" % (
                        time.strftime("%x"),
                        nextPage,
                        search_result["snippet"]["title"]
                    ))
        f.write(str(videos))
        f.write('\n')
        videos = []
        page = page + 1
        search_response = youtube.search().list(
            q=options.q,
            part="id,snippet",
            maxResults=options.maxResults,
            pageToken=nextPage
        ).execute()
Do you have any hints on why I encounter this behavior?
Thanks.

That specific exception is explained at Python httplib ResponseNotReady.
One thing to point out, though, is that you don't need to perform a separate youtube.videos.list() call for each video id. You can pass up to 50 comma-separated video ids as the id= parameter to a single youtube.videos.list() call. Cutting down on the number of HTTP requests you're making will lead to better performance and may work around the exception.
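For example, something along these lines (a sketch built on the search_response from your youtube_search(); the id parameter of videos().list() accepts a comma-separated string of up to 50 ids):
# Sketch: batch video ids instead of one videos().list() call per video.
# Assumes `youtube` and `search_response` exist as in the question's code.
video_ids = [item["id"]["videoId"]
             for item in search_response.get("items", [])
             if item["id"]["kind"] == "youtube#video"]

if video_ids:
    # One request covers up to 50 videos at once.
    info = youtube.videos().list(
        part="statistics,contentDetails",
        id=",".join(video_ids)
    ).execute()
    for info_result in info.get("items", []):
        print(info_result["id"], info_result["statistics"].get("viewCount"))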

Related

How to get word-level details from Vosk ASR

I have been working with Vosk and I need to get the time of each word in my file.mp3. This is my code:
import json

from pydub import AudioSegment
from vosk import Model, KaldiRecognizer

# FRAME_RATE and CHANNELS are constants defined elsewhere in the script
def voice_recognition(filename):
    model = Model(model_name="vosk-model-fa-0.5")
    rec = KaldiRecognizer(model, FRAME_RATE)
    rec.SetWords(True)

    mp3 = AudioSegment.from_mp3(filename)
    mp3 = mp3.set_channels(CHANNELS)
    mp3 = mp3.set_frame_rate(FRAME_RATE)

    step = 45000
    transcript = ""
    for i in range(0, len(mp3), step):
        segment = mp3[i:i+step]
        rec.AcceptWaveform(segment.raw_data)
        result = rec.Result()
        text = json.loads(result)["text"]
        transcript += text
    return transcript
I need something like this:
time word
-----------------------
(0.0.01, 0.0.2) hi
(0.0.03, 0.0.4) how
(0.0.04, 0.0.5) are
(0.0.05, 0.0.6) you
Is there any way to get the data like this?
I just found that everything I need is already there: once you call rec.SetWords(True), all the word-level details are included in result = rec.Result().
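For example (a sketch; with SetWords(True) the JSON returned by rec.Result() carries a "result" list with one entry per word, each holding start, end, conf and word):
# Sketch: collect (start, end, word) tuples from one chunk's Result() JSON.
# Assumes rec.SetWords(True) was called, as in the question.
words = []
chunk = json.loads(rec.Result())
for w in chunk.get("result", []):   # one dict per recognized word
    words.append((w["start"], w["end"], w["word"]))

for start, end, word in words:
    print("(%.2f, %.2f) %s" % (start, end, word))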

How to improve the speed of getting request content via the requests module

The functions below extract content from 'http://thegreyhoundrecorder.com.au/form-guides/' and append all content to a list. They work fine, although the speed at which the content is scraped from the website is slow. This line, tree = html.fromstring(page.content), in particular slows down the process. Is there a way I can improve the speed of my requests?
import lxml
from lxml import html
import requests
import re
import pandas as pd
from requests.exceptions import ConnectionError

greyhound_url = 'http://thegreyhoundrecorder.com.au/form-guides/'

def get_page(url):
    """fxn takes a page url and returns the links to the articles (Field) we
    want to scrape, in a list.
    """
    page = requests.get(url)
    tree = html.fromstring(page.content)
    my_list = tree.xpath('//tbody/tr/td[2]/a/@href')  # grab all links
    print('Length of all links = ', len(my_list))
    my_url = [page.url.split('/form-guides')[0] + str(s) for s in my_list]
    return my_url
def extract_data(my_url):
    """
    fxn takes a list of urls and extracts the needed information from the
    greyhound website.
    return: a list with the extracted fields
    """
    new_list = []
    try:
        for t in my_url:
            print(t)
            page_detail = requests.get(t)
            tree_1 = html.fromstring(page_detail.content)
            title = ''.join(tree_1.xpath('//div/h1[@class="title"]/text()'))
            race_number = tree_1.xpath("//tr[@id = 'tableHeader']/td[1]/text()")
            Distance = tree_1.xpath("//tr[@id = 'tableHeader']/td[3]/text()")
            TGR_Grade = tree_1.xpath("//tr[@id = 'tableHeader']/td[4]/text()")
            TGR1 = tree_1.xpath("//tbody/tr[@class='fieldsTableRow raceTipsRow']//div/span[1]/text()")
            TGR2 = tree_1.xpath("//tbody/tr[@class='fieldsTableRow raceTipsRow']//div/span[2]/text()")
            TGR3 = tree_1.xpath("//tbody/tr[@class='fieldsTableRow raceTipsRow']//div/span[3]/text()")
            TGR4 = tree_1.xpath("//tbody/tr[@class='fieldsTableRow raceTipsRow']//div/span[4]/text()")
            clean_title = title.split(' ')[0].strip()
            # clean title and extract track name
            Track = title.split(' ')[0].strip()
            # clean title and extract track date
            date = title.split('-')[1].strip()
            # clean title and extract track year
            year = pd.to_datetime('now').year
            # convert date to pandas datetime
            race_date = pd.to_datetime(date + ' ' + str(year)).strftime('%d/%m/%Y')
            # extract race number
            new_rn = []
            for number in race_number:
                match = re.search(r'^(.).*?(\d+)$', number)
                new_rn.append(match.group(1) + match.group(2))
            new_list.append((race_date, Track, new_rn, Distance, TGR_Grade, TGR1, TGR2, TGR3, TGR4))
        return new_list
    except ConnectionError as e:
        print('Connection error, connect to a stronger network or reload the page')
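Since the per-page parsing is cheap compared to the network round trips, one common way to cut the wall-clock time is to reuse a single requests.Session and download the detail pages concurrently, then parse each response with lxml as before. A minimal sketch, assuming the get_page() and greyhound_url defined above (fetch_all is a hypothetical helper, not part of the original code):
# Sketch: download the detail pages in parallel with a shared Session,
# then parse each response with lxml as extract_data() does above.
import concurrent.futures

import requests
from lxml import html

def fetch_all(urls, max_workers=8):
    session = requests.Session()          # reuse TCP connections
    def fetch(url):
        return session.get(url, timeout=10)
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        return list(pool.map(fetch, urls))

# usage: responses = fetch_all(get_page(greyhound_url))
#        trees = [html.fromstring(r.content) for r in responses]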

Why won't the NASA pictures display?

The pictures are not displaying. The code executes just fine. I took out the API key.
def gimmePictures(num):
    for n in range(0, num):
        now = datetime.datetime.now()
        day4Pictures = now - datetime.timedelta(days=n)
        data = {'api_key': '',
                'date': day4Pictures.date()}
        print(data)
        # using the params argument in our request
        result = requests.get('https://api.nasa.gov/planetary/apod', params=data)
        # create a dictionary for yesterday's picture
        dict_day = result.json()
        print(dict_day['date'])
        Image(dict_day['url'])

gimmePictures(10)
How can I display an image from a file in Jupyter Notebook?
from IPython.display import Image, display

def gimmePictures(num):
    listofImageNames = []
    for n in range(0, num):
        now = datetime.datetime.now()
        day4Pictures = now - datetime.timedelta(days=n)
        data = {'api_key': 'dcS6cZ9DJ4zt9oXwjF6hgemj38bNJo0IGcvFGZZj', 'date': day4Pictures.date()}
        # using the params argument in our request
        result = requests.get('https://api.nasa.gov/planetary/apod', params=data)
        # create a dictionary for yesterday's picture
        dict_day = result.json()
        listofImageNames.append(dict_day['url'])
    for imageName in listofImageNames:
        # Image(...) only builds the object; passing it to display() is what
        # actually renders it inside a function or loop.
        display(Image(imageName))

gimmePictures(10)

Get annotation text from its position (PDFMiner)

I want to extract the text of annotations (such as the highlighted text of hyperlinks) from their positions. For this I was able to scrape the positions and URLs using PDFMiner, as in the code below. Is it possible to pass such a position to a layout object and get the text out?
Here are the code blocks I used for this purpose.
The first part includes a function, named parse_annotations, to parse the annotations from each page.
def parse_annotations(page):
    positions = []
    urls = []
    for annot in pdftypes.resolve1(page.annots):
        if isinstance(annot, pdftypes.PDFObjRef):
            annotationDict = annot.resolve()
            # Skip over any annotations that are not links
            if str(annotationDict["Subtype"]) != "/'Link'":
                continue
            destID = 0
            position = annotationDict["Rect"]
            uriDict = "None"
            if any(k in annotationDict for k in {"Dest", "D"}):
                destID = (annotationDict["Dest"][0]).objid
                url = "Cross reference"
            elif "A" in annotationDict:
                # Key A contains PDFObjRef, then resolve it again
                if isinstance(annotationDict["A"], pdftypes.PDFObjRef):
                    uriDict = pdftypes.resolve1(annotationDict["A"])
                    if any(k in uriDict for k in {"Dest", "D"}):
                        destID = (uriDict["D"][0]).objid
                else:
                    uriDict = annotationDict["A"]
                # Check if the key exists within resolved uriDict
                if str(uriDict["S"]) == "/'GoTo'":
                    url = "Cross reference"
                elif str(uriDict["S"]) == "/'URI'":
                    url = str(uriDict["URI"])
                    url = url.lstrip("b")
                    url = url.replace("'", "")
                else:
                    # Skip if key S in uriDict does not contain value URI, GoTo
                    continue
            else:
                sys.stderr.write("Warning: unknown key in annotationDict: %s\n" % annotationDict)
            #print(annot, '\n', annotationDict, '\n', destID, '\n', position, '\n', uriDict, '\n', url, '\n')
            print(position, '\n', url, '\n')
            positions.append(position)
            urls.append(url)
        else:
            sys.stderr.write("Warning: unknown annotation: %s\n" % annot)
    return positions, urls
An example PDF file can be found at the link below.
https://www2.ed.gov/about/offices/list/ocr/docs/20200512-qa-psi-covid-19.pdf
Now, using PDFMiner, I created a document object and started looping over the pages found in the PDF.
manager = PDFResourceManager()
output = StringIO()
codec = 'utf-8'
laparams = LAParams()
converter = TextConverter(manager, output, codec=codec, laparams=laparams)
device = PDFPageAggregator(manager, laparams=laparams)
interpreter = PDFPageInterpreter(manager, device)
page_interpreter = PDFPageInterpreter(manager, converter)

filename = '20200512-qa-psi-covid-19.pdf'
fp = open(filename, 'rb')
parser = PDFParser(fp)
document = PDFDocument(parser)
if not document.is_extractable:
    raise PDFTextExtractionNotAllowed

page_no = 0
for pageNumber, page in enumerate(PDFPage.create_pages(document)):
    print("\n================ PageNumber ", pageNumber+1, "===================\n")
    if pageNumber == page_no:
        page_interpreter.process_page(page)
        raw_text = output.getvalue()
        output.truncate(0)
        output.seek(0)
        interpreter.process_page(page)
        layout = device.get_result()
        if page.annots:
            positions, urls = parse_annotations(page)
        for obj in layout:
            print('Object name and position %s \t %s \n' % (obj.__class__.__name__, obj.bbox))
        page_no += 1

fp.close()
converter.close()
output.close()
device.close()
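Something like the rough sketch below is what I have in mind (untested; it assumes the annotation Rect and each layout object's bbox share the same page coordinate system, and uses LTTextBox/LTTextLine from pdfminer.layout, which expose .bbox and .get_text()):
# Rough sketch: gather text from layout lines that overlap an annotation Rect.
# Assumes `layout` comes from PDFPageAggregator as above and that Rect/bbox
# are both (x0, y0, x1, y1) in page coordinates.
from pdfminer.layout import LTTextBox, LTTextLine

def text_in_rect(layout, rect, margin=2.0):
    x0, y0, x1, y1 = rect
    chunks = []
    for obj in layout:
        if isinstance(obj, LTTextBox):
            for line in obj:
                if isinstance(line, LTTextLine):
                    lx0, ly0, lx1, ly1 = line.bbox
                    # keep lines whose box intersects the annotation box
                    if (lx1 >= x0 - margin and lx0 <= x1 + margin
                            and ly1 >= y0 - margin and ly0 <= y1 + margin):
                        chunks.append(line.get_text().strip())
    return " ".join(chunks)

# e.g. for position, url in zip(positions, urls):
#          print(url, "->", text_in_rect(layout, position))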
Thanks in advance,
A.

Why is scrapy suddenly giving me an *unpredictable* AttributeError, stating no attribute 'css'

For my job, I built a scrapy spider to quickly check in on ~200-500 website landing pages for clues that the pages are not functioning, outside of just 400-style errors. (e.g. check for the presence of "out of stock" on page.) This check happens across approx. 30 different websites under my purview, all of them using the same page structure.
This has worked fine, every day, for 4 months.
Then, suddenly, and without change to the code, I started getting unpredictable errors, about 4 weeks ago:
url_title = response.css("title::text").extract_first()
AttributeError: 'Response' object has no attribute 'css'
If I run this spider, this error will occur with, say... 3 out of 400 pages.
Then, if I immediately run the spider again, those same 3 pages are scraped just fine without error, and 4 totally different pages will return the same error.
Furthermore, if I run the EXACT same spider as below, but replace mapping with just these 7 erroneous landing pages, they are scraped perfectly fine.
Is there something in my code that's not quite right??
I'm going to attach the whole code - sorry in advance!! - I just fear that something I might deem as superfluous may in fact be the cause. So this is the whole thing, but with sensitive data replaced with ####.
I've checked all of the affected pages, and of course the css is valid, and the title is always present.
I've done sudo apt-get update & sudo apt-get dist-upgrade on the server running scrapy, in hopes that this would help. No luck.
import scrapy
from scrapy import signals
from sqlalchemy.orm import sessionmaker
from datetime import date, datetime, timedelta
from scrapy.http.request import Request
from w3lib.url import safe_download_url
from sqlalchemy import and_, or_, not_
import smtplib
from email.MIMEMultipart import MIMEMultipart
from email.MIMEText import MIMEText
from sqlalchemy.engine import create_engine

engine = create_engine('mysql://######:#######localhost/LandingPages', pool_recycle=3600, echo=False)
#conn = engine.connect()
from LandingPageVerifier.models import LandingPagesFacebook, LandingPagesGoogle, LandingPagesSimplifi, LandingPagesScrapeLog, LandingPagesScrapeResults

Session = sessionmaker(bind=engine)
session = Session()

# today = datetime.now().strftime("%Y-%m-%d")
# thisyear = datetime.now().strftime("%Y")
# thismonth = datetime.now().strftime("%m")
# thisday = datetime.now().strftime("%d")
# start = date(year=2019,month=04,day=09)
todays_datetime = datetime(datetime.today().year, datetime.today().month, datetime.today().day)
print todays_datetime

landingpages_today_fb = session.query(LandingPagesFacebook).filter(LandingPagesFacebook.created_on >= todays_datetime).all()
landingpages_today_google = session.query(LandingPagesGoogle).filter(LandingPagesGoogle.created_on >= todays_datetime).all()
landingpages_today_simplifi = session.query(LandingPagesSimplifi).filter(LandingPagesSimplifi.created_on >= todays_datetime).all()
session.close()

#Mix 'em together!
landingpages_today = landingpages_today_fb + landingpages_today_google + landingpages_today_simplifi
#landingpages_today = landingpages_today_fb

#Do some iterating and formatting work
landingpages_today = [(u.ad_url_full, u.client_id) for u in landingpages_today]
#print landingpages_today
landingpages_today = list(set(landingpages_today))
#print 'Unique pages: '
#print landingpages_today
# unique_landingpages = [(u[0]) for u in landingpages_today]
# unique_landingpage_client = [(u[1]) for u in landingpages_today]
# print 'Pages----->', len(unique_landingpages)

class LandingPage004Spider(scrapy.Spider):
    name = 'LandingPage004Spider'

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(LandingPage004Spider, cls).from_crawler(crawler, *args, **kwargs)
        #crawler.signals.connect(spider.spider_opened, signals.spider_opened)
        crawler.signals.connect(spider.spider_closed, signals.spider_closed)
        return spider

    def spider_closed(self, spider):
        #stats = spider.crawler.stats.get_stats()
        stats = spider.crawler.stats.get_value('item_scraped_count'),
        Session = sessionmaker(bind=engine)
        session = Session()
        logitem = LandingPagesScrapeLog(
            scrape_count = spider.crawler.stats.get_value('item_scraped_count'),
            is200 = spider.crawler.stats.get_value('downloader/response_status_count/200'),
            is400 = spider.crawler.stats.get_value('downloader/response_status_count/400'),
            is403 = spider.crawler.stats.get_value('downloader/response_status_count/403'),
            is404 = spider.crawler.stats.get_value('downloader/response_status_count/404'),
            is500 = spider.crawler.stats.get_value('downloader/response_status_count/500'),
            scrapy_errors = spider.crawler.stats.get_value('log_count/ERROR'),
            scrapy_criticals = spider.crawler.stats.get_value('log_count/CRITICAL'),
        )
        session.add(logitem)
        session.commit()
        session.close()

    #mapping = landingpages_today
    handle_httpstatus_list = [200, 302, 404, 400, 500]
    start_urls = []

    def start_requests(self):
        for url, client_id in self.mapping:
            yield Request(url, callback=self.parse, meta={'client_id': client_id})

    def parse(self, response):
        ##DEBUG - return all scraped data
        #wholepage = response.body.lower()
        url = response.url
        if 'redirect_urls' in response.request.meta:
            redirecturl = response.request.meta['redirect_urls'][0]
            if 'utm.pag.ca' in redirecturl:
                url_shortener = response.request.meta['redirect_urls'][0]
            else:
                url_shortener = 'None'
        else:
            url_shortener = 'None'
        client_id = response.meta['client_id']
        url_title = response.css("title::text").extract_first()
        # pagesize = len(response.xpath('//*[not(descendant-or-self::script)]'))
        pagesize = len(response.body)
        HTTP_code = response.status

        ####ERROR CHECK: Small page size
        if 'instapage' in response.body.lower():
            if pagesize <= 20000:
                err_small = 1
            else:
                err_small = 0
        else:
            if pagesize <= 35000:
                err_small = 1
            else:
                err_small = 0

        ####ERROR CHECK: Page contains the phrase 'not found'
        if 'not found' in response.xpath('//*[not(descendant-or-self::script)]').extract_first().lower():
            #their sites are full of HTML errors, making scrapy unable to notice what is and is not inside a script element
            if 'dealerinspire' in response.body.lower():
                err_has_not_found = 0
            else:
                err_has_not_found = 1
        else:
            err_has_not_found = 0

        ####ERROR CHECK: Page contains the phrase 'can't be found'
        if "can't be found" in response.xpath('//*[not(self::script)]').extract_first().lower():
            err_has_cantbefound = 1
        else:
            err_has_cantbefound = 0

        ####ERROR CHECK: Page contains the phrase 'unable to locate'
        if 'unable to locate' in response.body.lower():
            err_has_unabletolocate = 1
        else:
            err_has_unabletolocate = 0

        ####ERROR CHECK: Page contains phrase 'no longer available'
        if 'no longer available' in response.body.lower():
            err_has_nolongeravailable = 1
        else:
            err_has_nolongeravailable = 0

        ####ERROR CHECK: Page contains phrase 'no service specials'
        if 'no service specials' in response.body.lower():
            err_has_noservicespecials = 1
        else:
            err_has_noservicespecials = 0

        ####ERROR CHECK: Page contains phrase 'Sorry, no' to match zero inventory for a search, which normally says "Sorry, no items matching your request were found."
        if 'sorry, no ' in response.body.lower():
            err_has_sorryno = 1
        else:
            err_has_sorryno = 0

        yield {'client_id': client_id, 'url': url, 'url_shortener': url_shortener, 'url_title': url_title, "pagesize": pagesize, "HTTP_code": HTTP_code, "err_small": err_small, 'err_has_not_found': err_has_not_found, 'err_has_cantbefound': err_has_cantbefound, 'err_has_unabletolocate': err_has_unabletolocate, 'err_has_nolongeravailable': err_has_nolongeravailable, 'err_has_noservicespecials': err_has_noservicespecials, 'err_has_sorryno': err_has_sorryno}

#E-mail settings
def sendmail(recipients, subject, body):
    fromaddr = "#######"
    toaddr = recipients
    msg = MIMEMultipart()
    msg['From'] = fromaddr
    msg['Subject'] = subject
    body = body
    msg.attach(MIMEText(body, 'html'))
    server = smtplib.SMTP('########')
    server.starttls()
    server.login(fromaddr, "##########")
    text = msg.as_string()
    server.sendmail(fromaddr, recipients, text)
    server.quit()
The expected result is a perfect scrape, with no errors.
The actual results are unpredictable AttributeErrors, claiming that attribute 'css' can't be found on some pages. But if I scrape those pages individually, using the same script, they scrape just fine.
Sometimes Scrapy can't parse HTML because of markup errors; that's why you can't call response.css(). You can catch these events in your code and analyze the broken HTML:
def parse(self, response):
    try:
        ....
        your code
        .....
    except:
        with open("Error.htm", "w") as f:
            f.write(response.body)
UPDATE: You can try to check for an empty response:
def parse(self, response):
    if not response.body:
        yield scrapy.Request(url=response.url, callback=self.parse, meta={'client_id': response.meta["client_id"]})
    # your original code
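Note that Scrapy filters duplicate requests by default, so the re-yielded request for the same URL may be dropped. A sketch of the retry with the duplicate filter disabled and a simple retry cap (retry_count is a hypothetical meta key, not part of the original code):
def parse(self, response):
    # Retry empty responses a limited number of times, bypassing the dupefilter.
    if not response.body:
        retries = response.meta.get('retry_count', 0)
        if retries < 3:
            yield scrapy.Request(
                url=response.url,
                callback=self.parse,
                dont_filter=True,  # otherwise the duplicate URL is filtered out
                meta={'client_id': response.meta['client_id'],
                      'retry_count': retries + 1},
            )
        return
    # ... your original parse logic ...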