youtube_dl video descriptions - pandas

I have a df containing a set of videoIDs from YT:
import pandas as pd
data = {'Order': ['1', '2', '3'],
'VideoID': ['jxwHmAoKte4', 'LsXM502SpiU','1I3f27iQ4pM']
}
df = pd.DataFrame (data, columns = ['Order','VideoID'])
print (df)
and want to download the video descriptions and save them in the same df in an extra column.
I tried to use youtube_dl in Jupyter this way:
import youtube_dl
def all_descriptions(URL):
videoID=df['VideoId']
URL = 'https://www.youtube.com/watch?v=' + videoID
ydl_opts = {
'forcedescription':True,
'skip_download': True,
'youtube-skip-dash-manifest': True,
'no_warnings': True,
'ignoreerrors': True
}
try:
youtube_dl.YoutubeDL(ydl_opts).download(URL)
return webpage
except:
pass
df['descriptions']=all_descriptions(URL)
I see the output of the code as text, but in df only "None" as text of the column.
Obviously I can't transport the output of the function to df in the proper way.
Can you suggest how to get it right?
Thank you in advance for help.

#perl
I modify the df to include two URLs that are causing two types of error:
import pandas as pd
data = {'Order': ['1', '2', '3', '4', '5'],
'VideoId': ['jxwHmAoKte4', 'LsXM502SpiU','1I3f27iQ4pM', 'MGQOX2rK5s', 'wNayw_E7lIA']
}
df = pd.DataFrame (data, columns = ['Order','VideoId'])
print (df)
Then I test it in the way you suggested, including my definition of ydl_opts:
videoID=df['VideoId']
URL = 'https://www.youtube.com/watch?v=' + videoID
ydl_opts = {
'forcedescription':True,
'skip_download': True,
'youtube-skip-dash-manifest': True,
'no_warnings': True,
'ignoreerrors': True
}
df['description'] = [
youtube_dl.YoutubeDL(ydl_opts).extract_info(
u, download=False)['description'] for u in URL]
df
Reaching to the first error I get the output:
TypeError: 'NoneType' object is not subscriptable
After that I replace 'forcedescription' in my code with 'extract_info':
def all_descriptions(URL):
videoID=df['VideoId']
URL = 'https://www.youtube.com/watch?v=' + videoID
ydl_opts = {
'forcedescription':True,
'skip_download': True,
'youtube-skip-dash-manifest': True,
'no_warnings': True,
'ignoreerrors': True
}
try:
youtube_dl.YoutubeDL(ydl_opts).download(URL)
return webpage
except:
pass
It skips all errors, but as the result there is nothing in the 'description'-column.
Any sugggestions?

You can use extract_info method:
df['description'] = [
youtube_dl.YoutubeDL().extract_info(
u, download=False)['description'] for u in URL]
df
Output:
Order VideoID description
0 1 jxwHmAoKte4 Bundesweit gelten sie nun ab heute, die schärf...
1 2 LsXM502SpiU Wie sicher ist der Impfstoff? Wäre eine Impfpf...
2 3 1I3f27iQ4pM Impfen ja oder nein, diese Frage stellen sich ...
P.S. The forcedescription parameter only prints the description to standard output, it doesn't return it
Update: extract_info returns None if it fails, so in case we have videos that may fail before getting the description from the info we can check that the info is not None:
ydl = youtube_dl.YoutubeDL(ydl_opts)
infos = [ydl.extract_info(u, download=False) for u in URL]
df['description'] = [
info['description'] if info is not None else ''
for info in infos]

Related

Handling errors within loops through exceptions

Tried my first python program to read temp sensor and output to influxdb
Occasionally temp sensor gives error "IndexError: list index out of range" and loop ends
I want loop to wait 15 seconds on this error and then continue the loop (sensor usually corrects itself by then on the next read)
My code:
import os
import glob
import time
import urllib
import urllib2
import httplib
import json
from influxdb import InfluxDBClient
client = InfluxDBClient(host='192.168.1.7', port=8086)
#client.get_list_database()
client.switch_database('influxdb1')
os.system('modprobe w1-gpio')
os.system('modprobe w1-therm')
base_dir = '/sys/devices/w1_bus_master1/'
device_folder = glob.glob(base_dir + '28*')[0]
while True:
device_file = device_folder + '/w1_slave'
def read_temp_raw():
f = open(device_file, 'r')
lines = f.readlines()
f.close()
return lines
def read_temp():
lines = read_temp_raw()
while lines[0].strip()[-3:] != 'YES':
time.sleep(0.2)
lines = read_temp_raw()
equals_pos = lines[1].find('t=')
if equals_pos != -1:
temp_string = lines[1][equals_pos+2:]
temp_c = float(temp_string) / 1000.0
return temp_c
temp = float(read_temp())
json_body = [
{
"measurement": "YOUR_MEASUREMENT",
"tags": {
"Device": "YOUR_DEVICE",
"ID": "YOUR_ID"
},
"fields": {
"outside_temp": temp,
}
}
]
client.write_points(json_body)
time.sleep(60)
******************************************************
which works ok :)
When I edit the code to catch the exception.....
******************************************************
while True:
except IndexError:
time.sleep(15)
continue
device_file = device_folder + '/w1_slave' # store the details
def read_temp_raw():
f = open(device_file, 'r')
lines = f.readlines() # read the device details
f.close()
return lines
def read_temp():
lines = read_temp_raw()
while lines[0].strip()[-3:] != 'YES':
time.sleep(0.2)
lines = read_temp_raw()
equals_pos = lines[1].find('t=')
if equals_pos != -1:
temp_string = lines[1][equals_pos+2:]
temp_c = float(temp_string) / 1000.0
return temp_c
temp = float(read_temp())
json_body = [
{
"measurement": "YOUR_MEASUREMENT",
"tags": {
"Device": "YOUR_DEVICE",
"ID": "YOUR_ID"
},
"fields": {
"outside_temp": temp,
}
}
]
client.write_points(json_body)
time.sleep(60)
************************************************************
I get following error...
File "temptoinfluxdb2.py", line 22
except IndexError:
^
SyntaxError: invalid syntax
Where am i going wrong please?
You will always need to use the except block in combination with a try block.
So the code in the try block is executed until an exception (in that case IndexError) occurs.
try:
# Execution block
except IndexError:
# Error handling
You could also use a more general approach with except Exception as e, which catches not just the IndexError but any exception.
Check the official documentation for further information.

Creating a registry with PySimpleGUI but I can't create the columns with the values ​of the keys

#error creating table with desired column
from PySimpleGUI import PySimpleGUI as sg
import pandas as pd
# Layout
sg.theme('Reddit')
layout = [
[sg.Text('Ticker: '), sg.Input(key='Ticker')],
[sg.Text('Quantidade de Papéis: '), sg.Input(key='Qtd_de_papeis')],
[sg.Text('Valor: '), sg.Input(key='Valor_pago')],
[sg.Text('Data:'), sg.Input(key='Data')],
[sg.Button('Adicionar'), sg.Button('Cancelar')]
]
# Janela
window = sg.Window('Tela de Cadastro', layout)
# Ler os eventos
while True:
events, values = window.read()
if events == (sg.WIN_CLOSED, 'Cancelar'):
break
if events == 'Adicionar':
columns = list(values.keys())
rows = list(values.values())
print(rows)
print(columns)
df.to_csv('registro.csv', sep=';', mode='a', index=False)
df_new = pd.read_csv('registro.csv', sep=';')
window.close()
print(df_new)
#I wanted to know if there is a way to assign columns or will it be necessary to create variables
Not sure if it work for you.
Here, it will create new CSV file when first record added.
You can check if file exist or not.
If exist, you can set headings = False before your event loop.
from PySimpleGUI import PySimpleGUI as sg
import pandas as pd
fields = {
'Ticker' : 'Ticker:',
'Qtd_de_papeis' : 'Quantidade de Papéis:',
'Valor_pago' : 'Valor:',
'Data' : 'Data:'
}
columns = list(fields.keys())
sg.theme('Reddit')
layout = [
[sg.Text(text), sg.Push(), sg.Input(key=key)] for key, text in fields.items()] + [
[sg.Button(button) for button in ('Adicionar', 'Cancelar')]
]
window = sg.Window('Tela de Cadastro', layout)
headings = True
while True:
events, values = window.read()
if events in (sg.WIN_CLOSED, 'Cancelar'):
break
if events == 'Adicionar':
df = pd.DataFrame({column:[] for column in columns})
df.loc[0] = [values[key] for key in columns]
if headings:
df.to_csv('registro.csv', sep=';', index=False)
else:
df.to_csv('registro.csv', sep=';', mode='a', index=False, header=False)
headings = False
df_new = pd.read_csv('registro.csv', sep=';')
print(df_new)
window.close()
Show, serviu muito bem.. Obrigado pela atenção.

Refactoring code so I dont have to implement 100+ functions

I'm making a crypto scanner which has to scan 100+ different cryptocoins at the same time. Now I'm having a really hard time simplifying this code because if I don't I'm gonna end up with more than 100 functions for something really easy. I'll post down here what I'm trying to refactor.
def main():
twm = ThreadedWebsocketManager(api_key=api_key,api_secret=api_secret)
twm.start()
dic = {'close': [], 'low': [], 'high': []}
dic2 = {'close': [], 'low': [], 'high': []}
def handle_socket_message(msg):
candle = msg['k']
close_price = candle['c']
highest_price = candle['h']
lowest_price = candle['l']
status = candle['x']
if status:
dic['close'].append(close_price)
dic['low'].append(lowest_price)
dic['high'].append(highest_price)
df = pd.DataFrame(dic)
print(df)
def handle_socket_message2(msg):
candle = msg['k']
close_price = candle['c']
highest_price = candle['h']
lowest_price = candle['l']
status = candle['x']
if status:
dic2['close'].append(close_price)
dic2['low'].append(lowest_price)
dic2['high'].append(highest_price)
df = pd.DataFrame(dic2)
print(df)
twm.start_kline_socket(callback=handle_socket_message, symbol='ETHUSDT')
twm.start_kline_socket(callback=handle_socket_message2, symbol='BTCUSDT')
twm.join()
As you can see I getting live data from BTCUSDT and ETHUSDT. Now I append the close,low and high prices to a dictionary and then I make a DataFrame out of those dictionaries. I tried to do this with 1 dictionary and 1 handle_socket_message function. But then it merges the values of both cryptocoins into 1 dataframe which is not what I want. Does anyone know how I can refactor this piece of code? I was thinking about something with a loop but I can't figure it out myself.
If you have any questions, ask away! Thanks in advance!
I don't know exactly what you are trying to do, but the following code might get you started (basically use a dict of dicts):
twm = ThreadedWebsocketManager(api_key=api_key,api_secret=api_secret)
twm.start()
symbols = ['ETHUSDT', 'BTCUSDT']
symbolToMessageKeys = {
'close': 'c',
'high': 'h',
'low': 'l'
}
dictPerSymbol = dict()
for sym in symbols:
d = dict()
dictPerSymbol[sym] = d
for key in symbolToMessageKeys.keys():
d[key] = list()
print(dictPerSymbol)
def handle_socket_message(msg):
candle = msg['k']
if candle['x']:
d = dictPerSymbol[msg['s']]
for (symbolKey, msgKey) in symbolToMessageKeys.items():
d[symbolKey].append(candle[msgKey])
df = pd.DataFrame(d)
print(df)
for sym in symbols:
twm.start_kline_socket(callback=handle_socket_message, symbol=sym)
twm.join()
Luckily, appending to lists seems thread safe. Warning: if it is not, then we have a major race condition in the code of this answer. I should also note that I haven't used neither ThreadedWebsocketManagers nor DataFrames (so the latter may as well introduce thread safety issues if it is meant to write in the provided dictionary).

GroupBy Function Not Applying

I am trying to groupby for the following specializations but I am not getting the expected result (or any for that matter). The data stays ungrouped even after this step. Any idea what's wrong in my code?
cols_specials = ['Enterprise ID','Specialization','Specialization Branches','Specialization Type']
specials = pd.read_csv(agg_specials, engine='python')
specials = specials.merge(roster, left_on='Enterprise ID', right_on='Enterprise ID', how='left')
specials = specials[cols_specials]
specials = specials.groupby(['Enterprise ID'])['Specialization'].transform(lambda x: '; '.join(str(x)))
specials.to_csv(end_report_specials, index=False, encoding='utf-8-sig')
Please try using agg:
import pandas as pd
df = pd.DataFrame(
[
['john', 'eng', 'build'],
['john', 'math', 'build'],
['kevin', 'math', 'asp'],
['nick', 'sci', 'spi']
],
columns = ['id', 'spec', 'type']
)
df.groupby(['id'])[['spec']].agg(lambda x: ';'.join(x))
resiults in:
if you need to preserve starting number of lines, use transform. transform returns one column:
df['spec_grouped'] = df.groupby(['id'])[['spec']].transform(lambda x: ';'.join(x))
df
results in:

python TypeError: expected string or buffer when parsing JSON from a file

I realize this problem has been answered for other folks but none of the threads are helping me solve it. I'm trying to parse a JSON structure and add all values in the sent_file when the keys match with the tweet_file. The error I'm getting
import sys
import json
def main():
sent_file = open(sys.argv[1])
tweet_file = open(sys.argv[2])
scores = {}
#tweet = {}
#tweet_text = {}
#hw()
#lines(sent_file)
#lines(tweet_file)
for line in sent_file:
term,score = line.split("\t")
scores[term] = int(score)
#print scores.items()
for tweets in tweet_file:
current_sent_value = 0
tweet = {} #this is a dict
#print type(tweets) str
tweet = json.loads(tweets)#[0] #this assignment changes tweet to a list. Why?
if 'text' in tweet:
tweet_text = {}
unicode_string = tweet['text']
encoded_string = unicode_string.encode('utf-8')
tweet_text = encoded_string.split()
for key in tweet_text:
for key in scores:
#print type(tweet_text) -- list
#print type(scores) --dict
if tweet_text.get(key) == scores.get(key): # get() does not work on a list. tweet_text is a list.
current_sent_value += scores(value)
print current_sent_value
if name == 'main':
main()
The error is here \assignment1\tweet_sentiment2.py", line 42, in main
if tweet_text.get(key) == scores.get(key): # get() does not work on a list. tweet_text is a list.
AttributeError: 'list' object has no attribute 'get'