substring not found (trying to use the YouTube API)

path = "D:\Data Analyst - YouTube\video\Marcelle Video"
api_key = "MY API"
youtube = build('youtube', 'v3', developerKey=api_key)
youtube_url = input("Enter Youtube URL: ")
video_id = youtube_url[youtube_url.index("?v=") + 3:]
I got this error:
Enter Youtube URL: https://youtu.be/W3CoM_xkn9E
Traceback (most recent call last):
File "d:\Data Analyst - YouTube\video\video.py", line 14, in
video_id = youtube_url[youtube_url.index("?v=") + 3:]
ValueError: substring not found
PS C:\Users\UTENTE>
Can you guys help me out here?

You have to check whether the URL contains the ?v= characters.
If it does, use your current code.
Otherwise, modify your code as follows:
# This is the URL:
youtube_url = "https://youtu.be/W3CoM_xkn9E"
# Separate the elements by "/" character:
myArray = youtube_url.split("/")
# Get the latest element in the array:
video_id = myArray[-1]
# Print the result:
print("video_id: " + video_id)
Result:
video_id: W3CoM_xkn9E
You can execute this code at mycompiler.io.
N.B.: YouTube links vary in their format, so you have to check how to handle the video URL (i.e. youtube_url).
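As a hedged illustration of that note, here is a minimal helper that handles both the youtu.be short form and the watch?v= form; the function name and the urllib.parse approach are my own choices, not part of the answer above:

from urllib.parse import urlparse, parse_qs

def extract_video_id(youtube_url):
    """Return the video id for both 'watch?v=' and 'youtu.be' style links."""
    parsed = urlparse(youtube_url)
    if parsed.hostname in ('youtu.be', 'www.youtu.be'):
        # Short links carry the id in the path: https://youtu.be/W3CoM_xkn9E
        return parsed.path.lstrip('/')
    # Standard links carry the id in the query string: https://www.youtube.com/watch?v=W3CoM_xkn9E
    return parse_qs(parsed.query).get('v', [''])[0]

print(extract_video_id("https://youtu.be/W3CoM_xkn9E"))                 # W3CoM_xkn9E
print(extract_video_id("https://www.youtube.com/watch?v=W3CoM_xkn9E"))  # W3CoM_xkn9E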

Related

Getting the root directory of a file in Google Drive using API

I need to get the full path of the folders where a file is located in Google Drive. I'm getting the files themselves using the Google Drive API, but I need information about each file's parent folders.
I'm using the following code to get the list of spreadsheets in a Shared Drive:
from googleapiclient import discovery
from httplib2 import Http
from oauth2client import file, client, tools
# Change the value of SCOPES to 'https://www.googleapis.com/auth/drive'
# if you want to be able to read and write to the user's Google Drive.
SCOPES = 'https://www.googleapis.com/auth/drive'
store = file.Storage('storage.json')
creds = store.get()
if not creds or creds.invalid:
    flow = client.flow_from_clientsecrets('client_secret.json', SCOPES)
    creds = tools.run_flow(flow, store)
DRIVE = discovery.build('drive', 'v3', http=creds.authorize(Http()))
folder_id = "1Z1GzY-D3I3qwQu3oxIW-L1a9nXgD0PXl"
query = "mimeType='application/vnd.google-apps.spreadsheet'"
query+= "and fullText contains 'CLAS' and trashed = false"
# query += " and parents in '" + folder_id + "'"
spreadsheets = []
# Initialize the page token
next_page_token = None
# Loop until all pages of results have been retrieved
while True:
    # Execute the list request
    response = DRIVE.files().list(
        q=query,
        corpora='drive',
        includeItemsFromAllDrives=True,
        driveId='0AEJNMySKcEzsUk9PVA',
        supportsAllDrives=True,
        # orderBy='folder',
        pageSize=1000,
        fields='nextPageToken, files(id, name, parents, mimeType, webViewLink)',
        pageToken=next_page_token,
    ).execute()
    # Append the results to the list
    spreadsheets.extend(response.get('files', []))
    # Check if there is another page of results
    next_page_token = response.get('nextPageToken', None)
    if next_page_token is None:
        break
    # Set the page token for the next iteration
    # parameters['pageToken'] = next_page_token
# Print the number of results
print(f'Last spreadsheet found: {spreadsheets[-1]["name"]}. Number of spreadsheets: {len(spreadsheets)}')
This returns a list of dictionaries with the specified fields. I would like to know the names of the parent folders for each file, for which I'm trying:
from googleapiclient.errors import HttpError
for item in spreadsheets:
    if 'parents' in item:
        parent_folders_list = []
        parent_id = item['parents'][0]
        try:
            while parent_id:
                folder = DRIVE.files().get(fileId=parent_id, fields='name, id, parents').execute()
                parent_folders_list.append(folder.get("parents", []))
                if parent_id:
                    parent_id = parent_id[0]
        except HttpError as error:
            print('An error occurred: %s' % error)
        print(f'{item["name"]} is in {parent_folders_list}')
I've verified that parent_id is retrieved correctly and that I can access it (I was able to open it in the browser). However, I get 'File not found' errors for every parent_id. I wonder whether DRIVE.files().get(fileId=...) is the correct way to retrieve a folder through the API.
Any help would be greatly appreciated.
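In case it helps others reading this, here is a minimal sketch of how the parent chain could be walked with files().get(); it is my own illustration rather than a confirmed fix, and the supportsAllDrives=True flag is an assumption (without it, items that live in a shared drive can be reported as not found). DRIVE and spreadsheets refer to the objects built in the question's code.

def parent_folder_names(drive, file_id):
    """Walk up the parent chain of a file and return the folder names, nearest parent first."""
    names = []
    # Assumption: supportsAllDrives=True is required for items in a shared drive
    meta = drive.files().get(fileId=file_id, fields='name, parents',
                             supportsAllDrives=True).execute()
    parents = meta.get('parents', [])
    while parents:
        folder = drive.files().get(fileId=parents[0], fields='name, id, parents',
                                   supportsAllDrives=True).execute()
        names.append(folder['name'])
        parents = folder.get('parents', [])
    return names

for item in spreadsheets:
    print(item['name'], '->', parent_folder_names(DRIVE, item['id']))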

Webscraping customer review - Invalid selector error using XPath

I am trying to extract the user id, rating and review from the following site using Selenium, and it is showing an "Invalid selector" error. I think the XPath I defined to get the review text is the reason for the error, but I am unable to resolve the issue. The site link is below:
teslamotor review
The code that I have used is the following:
# Class for review web scraping from the consumeraffairs.com site
class CarForumCrawler():
    def __init__(self, start_link):
        self.link_to_explore = start_link
        self.comments = pd.DataFrame(columns=['rating', 'user_id', 'comments'])
        self.driver = webdriver.Chrome(executable_path=r'C:/Users/mumid/Downloads/chromedriver/chromedriver.exe')
        self.driver.get(self.link_to_explore)
        self.driver.implicitly_wait(5)
        self.extract_data()
        self.save_data_to_file()

    def extract_data(self):
        ids = self.driver.find_elements_by_xpath("//*[contains(@id,'review-')]")
        comment_ids = []
        for i in ids:
            comment_ids.append(i.get_attribute('id'))
        for x in comment_ids:
            # Extract the rating for each user on a page
            user_rating = self.driver.find_elements_by_xpath('//*[@id="' + x + '"]/div[1]/div/img')[0]
            rating = user_rating.get_attribute('data-rating')
            # Extract the user id for each user on a page
            userid_element = self.driver.find_elements_by_xpath('//*[@id="' + x + '"]/div[2]/div[2]/strong')[0]
            userid = userid_element.get_attribute('itemprop')
            # Extract the message for each user on a page
            user_message = self.driver.find_elements_by_xpath('//*[@id="' + x + '"]]/div[3]/p[2]/text()')[0]
            comment = user_message.text
            # Add rating, userid and comment for each user to the dataframe
            self.comments.loc[len(self.comments)] = [rating, userid, comment]

    def save_data_to_file(self):
        # Save the dataframe content to a CSV file
        self.comments.to_csv('Tesla_rating-6.csv', index=None, header=True)

    def close_spider(self):
        # End the session
        self.driver.quit()

try:
    url = 'https://www.consumeraffairs.com/automotive/tesla_motors.html'
    mycrawler = CarForumCrawler(url)
    mycrawler.close_spider()
except:
    raise
The error that I am getting is the following:
Also, the XPath that I tried to trace is from the following HTML:
You are seeing the classic error of...
as find_elements_by_xpath('//*[@id="' + x +'"]]/div[3]/p[2]/text()')[0] would select text nodes, whereas you need to pass an XPath expression that selects elements.
You need to change it to:
user_message = self.driver.find_elements_by_xpath('//*[@id="' + x + '"]/div[3]/p[2]')[0]
References
You can find a couple of relevant detailed discussions in:
invalid selector: The result of the xpath expression "//a[contains(@href, 'mailto')]/@href" is: [object Attr] getting the href attribute with Selenium
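To make the idea concrete, here is a small hedged sketch (the helper name and the 'review-123' id are made up for illustration): once the <p> element itself is selected, its text is read through the element's .text property rather than through an XPath text() node.

def review_text(driver, review_id):
    """Return the review body for one review container id, e.g. 'review-123' (hypothetical)."""
    # Select the <p> element itself, not its text() node
    elements = driver.find_elements_by_xpath('//*[@id="' + review_id + '"]/div[3]/p[2]')
    return elements[0].text if elements else ''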

How to get website to consistently return content from a GET request when it's inconsistent?

I posted a similar question earlier but I think this is a more refined question.
I'm trying to scrape: https://www.prosportstransactions.com/football/Search/SearchResults.php?Player=&Team=&BeginDate=&EndDate=&PlayerMovementChkBx=yes&submit=Search&start=0
My code randomly throws errors when I send a GET request to the URL. After debugging, I saw the following happen. A GET request for the following URL will be sent (example URL; this could happen on any page): https://www.prosportstransactions.com/football/Search/SearchResults.php?Player=&Team=&BeginDate=&EndDate=&PlayerMovementChkBx=yes&submit=Search&start=2400
The webpage will then say "There were no matching transactions found.". However, if I refresh the page, the content will then be loaded. I'm using BeautifulSoup and Selenium and have put sleep statements in my code in the hope that it'll work, but to no avail. Is this a problem on the website's end? It doesn't make sense to me how one GET request will return nothing but the exact same request will return something. Also, is there anything I could do to fix it, or is it out of my control?
Here is a sample of my code:
def scrapeWebsite(url, start, stop):
    driver = webdriver.Chrome(executable_path='/Users/Downloads/chromedriver')
    print(start, stop)
    madeDict = {"Date": [], "Team": [], "Name": [], "Relinquished": [], "Notes": []}
    #for i in range(0, 214025, 25):
    for i in range(start, stop, 25):
        print("Current Page: " + str(i))
        currUrl = url + str(i)
        #print(currUrl)
        #r = requests.get(currUrl)
        #soupPage = BeautifulSoup(r.content)
        driver.get(currUrl)
        #Sleep program for dynamic refreshing
        time.sleep(1)
        soupPage = BeautifulSoup(driver.page_source, 'html.parser')
        #page = urllib2.urlopen(currUrl)
        #time.sleep(2)
        #soupPage = BeautifulSoup(page, 'html.parser')
        info = soupPage.find("table", attrs={'class': 'datatable center'})
        time.sleep(1)
        extractedInfo = info.findAll("td")
The error occurs at the last line: findAll fails because info is None when the page comes back without the results table (meaning the GET request returned nothing useful).
I worked around this and scraped all the pages using try/except.
The request loop is probably so fast that the page can't keep up.
See the example below; it worked like a charm:
import requests
from bs4 import BeautifulSoup

URL = 'https://www.prosportstransactions.com/football/Search/SearchResults.php?Player=&Team=&BeginDate=&EndDate=' \
      '&PlayerMovementChkBx=yes&submit=Search&start=%s'

def scrape(start=0, stop=214525):
    for page in range(start, stop, 25):
        current_url = URL % page
        print('scrape: current %s' % page)
        while True:
            try:
                response = requests.request('GET', current_url)
                if response.ok:
                    soup = BeautifulSoup(response.content.decode('utf-8'), features='html.parser')
                    table = soup.find("table", attrs={'class': 'datatable center'})
                    trs = table.find_all('tr')
                    slice_pos = 1 if page > 0 else 0
                    for tr in trs[slice_pos:]:
                        yield tr.find_all('td')
                    break
            except Exception as exception:
                print(exception)

for columns in scrape():
    values = [column.text.strip() for column in columns]
    # Continue your code ...
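One refinement I would consider (my own suggestion, not part of the answer above) is to pause briefly before retrying the same page, so the site isn't hit in a tight loop. A minimal sketch of such a retry helper; the "no matching transactions" check simply mirrors the message quoted in the question:

import time
import requests

def get_with_retry(url, retries=5, delay=2.0):
    """Fetch a page, retrying with a short pause whenever it comes back 'empty'."""
    response = None
    for _ in range(retries):
        response = requests.get(url)
        if response.ok and 'There were no matching transactions found' not in response.text:
            return response
        time.sleep(delay)  # give the site a moment before asking again
    return response  # hand back the last attempt even if it still looks empty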

Better way to clean product description using BeautifulSoup?

I have written the following code to fetch the product description from a site using BeautifulSoup:
def get_soup(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            html = response.content
            return BeautifulSoup(html, "html.parser")
    except Exception as ex:
        print("error from " + url + ": " + str(ex))

def get_product_details(url):
    try:
        soup = get_soup(url)
        prod_details = dict()
        desc_list = soup.select('p ~ ul')
        prod_details['description'] = ''.join(desc_list)
        return prod_details
    except Exception as ex:
        logger.warning('%s - %s', ex, url)

if __name__ == '__main__':
    get_product_details("http://www.aprisin.com.sg/p-748-littletikespoptunesguitar.html")
In the above code I am trying to convert the description (a list) to a string, but I am getting the issue below:
[WARNING] aprisin.py:82 get_product_details() : sequence item 0: expected str instance, Tag found - http://www.aprisin.com.sg/p-748-littletikespoptunesguitar.html
Output of the description without converting it to a string:
[<ul>
<li>Freestyle</li>
<li>Play along with 5 pre-set tunes: </li>
</ul>, <ul>
<li>Each string will play a note</li>
<li>Guitar has a whammy bar</li>
<li>2-in-1 volume control and power button </li>
<li>Simple and easy to use </li>
<li>Helps develop music appreciation </li>
<li>Requires 3 "AA" alkaline batteries (included)</li>
</ul>]
You are passing a list of Tag objects instead of strings to join(). join() works with a list of strings. Change the join call as follows:
prod_details['description'] = ''.join([tag.get_text() for tag in desc_list])
or, if each tag contains only a single string child:
prod_details['description'] = ''.join([tag.string for tag in desc_list])
In case you want the description along with the HTML content, you can use one of the following:
# this will preserve the html tags and indentation.
prod_details['description'] = ''.join([tag.prettify() for tag in desc_list])
or
# this will return the html content as string.
prod_details['description'] = ''.join([str(tag) for tag in desc_list])
desc_list is a list of bs4.element.Tag. You should convert the tag to a string:
desc_list = soup.select('p ~ ul')
prod_details['description'] = str(desc_list[0])
You're trying to join a list of Tags, but the join method needs str arguments. Try:
''.join([str(i) for i in desc_list])
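If the goal is the visible text rather than the raw HTML, a small self-contained sketch could look like this (the sample markup below is a made-up fragment shaped like the output quoted in the question):

from bs4 import BeautifulSoup

# Hypothetical fragment mirroring the <ul> lists quoted above
html = """
<p>Intro</p>
<ul><li>Freestyle</li><li>Play along with 5 pre-set tunes:</li></ul>
<ul><li>Each string will play a note</li><li>Guitar has a whammy bar</li></ul>
"""
soup = BeautifulSoup(html, "html.parser")
desc_list = soup.select('p ~ ul')

# Join the visible text of every matched <ul>, separating items with spaces
description = ' '.join(tag.get_text(' ', strip=True) for tag in desc_list)
print(description)
# Freestyle Play along with 5 pre-set tunes: Each string will play a note Guitar has a whammy bar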

web2py REST API endpoint gives "invalid path" output

I have made a web2py web application. The API endpoints exposed are as follows:
"/comments[comments]"
"/comments/id/{comments.id}"
"/comments/id/{comments.id}/:field"
"/comments/user-id/{comments.user_id}"
"/comments/user-id/{comments.user_id}/:field"
"/comments/date-commented/{comments.date_commented.year}"
"/comments/date-commented/{comments.date_commented.year}/:field"
"/comments/date-commented/{comments.date_commented.year}/{comments.date_commented.month}"
"/comments/date-commented/{comments.date_commented.year}/{comments.date_commented.month}/:field"
"/comments/date-commented/{comments.date_commented.year}/{comments.date_commented.month}/{comments.date_commented.day}"
"/comments/date-commented/{comments.date_commented.year}/{comments.date_commented.month}/{comments.date_commented.day}/:field"
"/comments/date-commented/{comments.date_commented.year}/{comments.date_commented.month}/{comments.date_commented.day}/{comments.date_commented.hour}"
"/comments/date-commented/{comments.date_commented.year}/{comments.date_commented.month}/{comments.date_commented.day}/{comments.date_commented.hour}/:field"
"/comments/date-commented/{comments.date_commented.year}/{comments.date_commented.month}/{comments.date_commented.day}/{comments.date_commented.hour}/{comments.date_commented.minute}"
"/comments/date-commented/{comments.date_commented.year}/{comments.date_commented.month}/{comments.date_commented.day}/{comments.date_commented.hour}/{comments.date_commented.minute}/:field"
"/comments/date-commented/{comments.date_commented.year}/{comments.date_commented.month}/{comments.date_commented.day}/{comments.date_commented.hour}/{comments.date_commented.minute}/{comments.date_commented.second}"
"/comments/date-commented/{comments.date_commented.year}/{comments.date_commented.month}/{comments.date_commented.day}/{comments.date_commented.hour}/{comments.date_commented.minute}/{comments.date_commented.second}/:field"
"/comments/complaint-id/{comments.complaint_id}"
"/comments/complaint-id/{comments.complaint_id}/:field"
The comments model is as follows
models/db.py
db.define_table(
    'comments',
    Field('user_id', db.auth_user),
    Field('comment_made', 'string', length=2048),
    Field('date_commented', 'datetime', default=datetime.now),
    Field('complaint_id', db.complaints),
    Field('detailed_status', 'string', length=2048),
)
I have been successful in retrieving a single comment via the following request:
localhost:8000/api/comments/id/1.json
Now I wish to retrieve all the comments. I am not able to figure out how to use /comments[comments] to retrieve all of them.
I have tried
localhost:8000/api/comments.json
But it gives "invalid path" as output.
I have realized that requests such as http://localhost:8000/api/comments/complaint-id/1.json
also give "invalid path" as output.
Please help.
EDIT:
Controllers/default.py
@request.restful()
def api():
    response.view = 'generic.' + request.extension

    def GET(*args, **kargs):
        patterns = 'auto'
        parser = db.parse_as_rest(patterns, args, kargs)
        if parser.status == 200:
            return dict(content=parser.response)
        else:
            raise HTTP(parser.status, parser.error)

    def POST(*args, **kargs):
        return dict()

    return locals()
routes.py in the main web2py folder to change the default application:
routers = dict(
    BASE = dict(
        default_application='GRS',
    )
)
Another observation:
I added another endpoint as below:
def comm():
    """Comments api substitute"""
    rows = db().select(db.comments.ALL)  ## this line shows error
    # rows = db(db.comments.id > 0).select()
    # rows = [[5,6],[3,4],[1,2]]
    # for row in rows:
    #     print row.id
    return dict(message=rows)
Even now I am not able to retrieve all comments with "/comm.json". This gives a web2py error ticket which says "need more than 1 value to unpack" on the line "rows = db().select(db.comments.ALL)". Are the "invalid path" responses above and this error related in some way?
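In case it is useful for comparison, here is a minimal sketch of the same controller using an explicit pattern list instead of patterns='auto'; this is my own illustration based on the pattern list printed above, not a verified fix for this app:

@request.restful()
def api():
    response.view = 'generic.' + request.extension

    def GET(*args, **kargs):
        # Explicit patterns: the first one is meant to map GET /api/comments.json to all rows of comments
        patterns = [
            "/comments[comments]",
            "/comments/id/{comments.id}",
        ]
        parser = db.parse_as_rest(patterns, args, kargs)
        if parser.status == 200:
            return dict(content=parser.response)
        raise HTTP(parser.status, parser.error)

    return locals()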