My task is to clean the data and present it in the following format:

DataFrame([["Michigan", "Ann Arbor"], ["Michigan", "Yipsilanti"]], columns=["State", "RegionName"])

I'm getting an error. How can I fix it?
"UnboundLocalError: local variable 'state' referenced before assignment"
The code is the following:
with open('university_towns.txt') as file:
    data = []
    for line in file:
        data.append(line[:-1])

state_town = []
for line in data:
    if line[:-6] == '[edit]':
        state = line[:-6]
    elif '(' in line:
        town = line[:line.index('(')-1]
        state_town.append([state, town])
    else:
        town = line
        state_town.append([state, town])

state_college_df = pd.DataFrame(state_town, columns=['State', 'RegionName'])
return state_college_df
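A likely cause of the UnboundLocalError: line[:-6] drops the last six characters instead of selecting them, so the '[edit]' test never matches and state is never assigned before it is first used. A minimal sketch of the corrected loop, assuming the state lines end with '[edit]':

state_town = []
for line in data:
    if line[-6:] == '[edit]':                  # last six characters, not everything except them
        state = line[:-6]                      # strip the trailing '[edit]' to get the state name
    elif '(' in line:
        town = line[:line.index('(') - 1]
        state_town.append([state, town])
    else:
        state_town.append([state, line])

state_college_df = pd.DataFrame(state_town, columns=['State', 'RegionName'])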
I tried my first Python program to read a temp sensor and output to InfluxDB.
Occasionally the temp sensor gives the error "IndexError: list index out of range" and the loop ends.
I want the loop to wait 15 seconds on this error and then continue (the sensor usually corrects itself by the next read).
My code:
import os
import glob
import time
import urllib
import urllib2
import httplib
import json
from influxdb import InfluxDBClient

client = InfluxDBClient(host='192.168.1.7', port=8086)
#client.get_list_database()
client.switch_database('influxdb1')

os.system('modprobe w1-gpio')
os.system('modprobe w1-therm')

base_dir = '/sys/devices/w1_bus_master1/'
device_folder = glob.glob(base_dir + '28*')[0]

while True:

    device_file = device_folder + '/w1_slave'

    def read_temp_raw():
        f = open(device_file, 'r')
        lines = f.readlines()
        f.close()
        return lines

    def read_temp():
        lines = read_temp_raw()
        while lines[0].strip()[-3:] != 'YES':
            time.sleep(0.2)
            lines = read_temp_raw()
        equals_pos = lines[1].find('t=')
        if equals_pos != -1:
            temp_string = lines[1][equals_pos+2:]
            temp_c = float(temp_string) / 1000.0
            return temp_c

    temp = float(read_temp())

    json_body = [
        {
            "measurement": "YOUR_MEASUREMENT",
            "tags": {
                "Device": "YOUR_DEVICE",
                "ID": "YOUR_ID"
            },
            "fields": {
                "outside_temp": temp,
            }
        }
    ]

    client.write_points(json_body)
    time.sleep(60)
******************************************************
which works ok :)
When I edit the code to catch the exception.....
******************************************************
while True:

    except IndexError:
        time.sleep(15)
        continue

    device_file = device_folder + '/w1_slave'   # store the details

    def read_temp_raw():
        f = open(device_file, 'r')
        lines = f.readlines()   # read the device details
        f.close()
        return lines

    def read_temp():
        lines = read_temp_raw()
        while lines[0].strip()[-3:] != 'YES':
            time.sleep(0.2)
            lines = read_temp_raw()
        equals_pos = lines[1].find('t=')
        if equals_pos != -1:
            temp_string = lines[1][equals_pos+2:]
            temp_c = float(temp_string) / 1000.0
            return temp_c

    temp = float(read_temp())

    json_body = [
        {
            "measurement": "YOUR_MEASUREMENT",
            "tags": {
                "Device": "YOUR_DEVICE",
                "ID": "YOUR_ID"
            },
            "fields": {
                "outside_temp": temp,
            }
        }
    ]

    client.write_points(json_body)
    time.sleep(60)
************************************************************
I get the following error...
File "temptoinfluxdb2.py", line 22
except IndexError:
^
SyntaxError: invalid syntax
Where am I going wrong, please?
You always need to use an except block in combination with a try block.
The code in the try block is executed until an exception (in this case, IndexError) occurs.
try:
    # Execution block
except IndexError:
    # Error handling
You could also use a more general approach with except Exception as e, which catches not just the IndexError but any exception.
Check the official documentation for further information.
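Applied to the sensor loop above, a minimal sketch (assuming read_temp_raw and read_temp are defined once before the loop and that the IndexError is raised inside read_temp):

while True:
    try:
        temp = float(read_temp())      # occasionally raises IndexError when the sensor glitches
    except IndexError:
        time.sleep(15)                 # give the sensor time to recover
        continue                       # then retry from the top of the loop

    json_body = [
        {
            "measurement": "YOUR_MEASUREMENT",
            "tags": {"Device": "YOUR_DEVICE", "ID": "YOUR_ID"},
            "fields": {"outside_temp": temp},
        }
    ]
    client.write_points(json_body)
    time.sleep(60)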
I would like to save the results of the entire iteration into an Excel file, but at the moment I only save the last run. Why is this happening and how can I solve it?
I added an 'append' line, but then I get an error message:
path = '../'
df1 = []
for file in os.listdir(path):
    if file.endswith('.txt'):
        with open(os.path.join(path, file)) as f:
            df = pd.read_csv(f, sep="\t", header=0, usecols=[0, 11])
            df.columns = ["x", "y"]
            abs_PAR = []
            mean1 = []
            for (x, y) in df.iteritems():
                abs_PAR = sum(y.iloc[49:350]) / len(y.iloc[49:350])
                mean1.append(abs_PAR)
                newrow = {0: abs_PAR}
                df1 = df1.append(newrow)
                print(newrow)

writer = ExcelWriter('df1.xlsx')
df1.to_excel(writer, 'Sheet1', index=False)
writer.save()
Error message:
AttributeError: 'NoneType' object has no attribute 'append'
Thank you in advance
You should not use the assignment operator (=) with append. Try changing
df1 = df1.append(newrow)
to
df1.append(newrow)
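df1 here is a plain Python list, and list.append returns None, which is why reassigning turns df1 into None and the next call fails. A minimal sketch of the pattern, collecting the rows and building the DataFrame once after the loop (assuming you want one value per file computed from the y column; the "file" column is added here only for illustration):

import os
import pandas as pd

path = '../'
rows = []
for file in os.listdir(path):
    if file.endswith('.txt'):
        df = pd.read_csv(os.path.join(path, file), sep="\t", header=0, usecols=[0, 11])
        df.columns = ["x", "y"]
        abs_PAR = df["y"].iloc[49:350].mean()            # mean of the slice, one value per file
        rows.append({"file": file, "abs_PAR": abs_PAR})  # plain list.append, no reassignment

df1 = pd.DataFrame(rows)                                 # build the DataFrame once, after the loop
df1.to_excel('df1.xlsx', sheet_name='Sheet1', index=False)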
Good Morning,
I have a dictionary organized the way shown below, and what I want to do is use the dictionary values as the column number for the key, as shown below:
My first idea was to loop through the dictionary and create a text file where dico_values = tabs, and then transform this new file into an Excel file, but that seems like one step too many. Cheers to all
You could perhaps try this:
new_dico = {value: [] for value in dico.values()}  # {1: [], 2: [], 3: [], ...}

for key, value in dico.items():
    new_dico[value].append(key)
    for otherkey in new_dico.keys():
        if otherkey == value:
            continue
        else:
            new_dico[otherkey].append("")

# new_dico == {1: ["President of ...", "Members of ..", "", ...], 2: ["", "", "President's Office", ...], ...}
# Then you can make a dataframe of 'new_dico' with pandas, for instance
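For instance, a self-contained sketch of that last step (the dico contents and the 'columns.xlsx' file name are made up for illustration):

import pandas as pd

# hypothetical example values; replace with your real dico
dico = {"President of the Board": 1, "Members of the Board": 1, "President's Office": 2}

new_dico = {value: [] for value in dico.values()}
for key, value in dico.items():
    new_dico[value].append(key)
    for otherkey in new_dico.keys():
        if otherkey != value:
            new_dico[otherkey].append("")

df = pd.DataFrame(new_dico)               # one column per distinct dictionary value
df.to_excel('columns.xlsx', index=False)  # write straight to Excel, no intermediate text file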
I get a MissingInputException when I run the following snakemake code:
import re
import os

glob_vars = glob_wildcards(os.path.join(os.getcwd(), "inputs", "{fileName}.{ext}"))

rule end:
    input:
        expand(os.path.join(os.getcwd(), "inputs", "{fileName}_rename.fas"), fileName=glob_vars.fileName)

rule rename:
    '''
    rename fasta file to avoid problems
    '''
    input:
        expand("inputs/{{fileName}}.{ext}", ext=glob_vars.ext)
    output:
        os.path.join(os.getcwd(), "inputs", "{fileName}_rename.fas")
    run:
        list_ = []
        with open(str(input)) as f2:
            line = f2.readline()
            while line:
                while not line.startswith('>') and line:
                    line = f2.readline()
                fas_name = re.sub(r"\W", "_", line.strip())
                list_.append(fas_name)
                fas_seq = ""
                line = f2.readline()
                while not line.startswith('>') and line:
                    fas_seq += re.sub(r"\s", "", line)
                    line = f2.readline()
                list_.append(fas_seq)
        with open(str(output), "w") as f:
            f.write("\n".join(list_))
My inputs folder contains these files:
G.bullatarudis.fasta
goldfish_protein.faa
guppy_protein.faa
gyrodactylus_salaris.fasta
protopolystoma_xenopodis.fa
salmon_protein.faa
schistosoma_mansoni.fa
The error message is:
Building DAG of jobs...
MissingInputException in line 10 of /home/zhangdong/works/NCBI/BLAST/RHB/test.rule:
Missing input files for rule rename:
inputs/guppy_protein.fasta
inputs/guppy_protein.fa
I assume the error is caused by the expand function: only the file guppy_protein.faa exists, but expand also generates guppy_protein.fasta and guppy_protein.fa. Are there any solutions?
By default, expand will produce all combinations of the input lists, so this is expected behavior. You need your input to look up the proper extension given a fileName. I haven't tested this:
glob_vars = glob_wildcards(os.path.join(os.getcwd(), "inputs", "{fileName}.{ext}"))

# create a dict to look up extensions given fileNames
glob_vars_dict = {fname: ex for fname, ex in zip(glob_vars.fileName, glob_vars.ext)}

def rename_input(wildcards):
    ext = glob_vars_dict[wildcards.fileName]
    return f"inputs/{wildcards.fileName}.{ext}"

rule rename:
    input: rename_input
A few unsolicited style comments:
You don't have to prepend your glob_wildcards pattern with os.getcwd; glob_wildcards("inputs/{fileName}.{ext}") should work, as snakemake uses paths relative to the working directory by default.
Try to stick with snake_case instead of camelCase for your variable names in Python.
In this case, fileName isn't a great descriptor of what you are capturing. Maybe species_name or species would be clearer
Thanks to Troy Comi, I modified my code and it worked:
import re
import os
import itertools

speciess, exts = glob_wildcards(os.path.join(os.getcwd(), "inputs_test", "{species}.{ext}"))

rule end:
    input:
        expand("inputs_test/{species}_rename.fas", species=speciess)

def required_files(wildcards):
    list_combination = itertools.product([wildcards.species], list(set(exts)))
    exist_file = ""
    for file in list_combination:
        if os.path.exists(f"inputs_test/{'.'.join(file)}"):
            exist_file = f"inputs_test/{'.'.join(file)}"
    return exist_file

rule rename:
    '''
    rename fasta file to avoid problems
    '''
    input:
        required_files
    output:
        "inputs_test/{species}_rename.fas"
    run:
        list_ = []
        with open(str(input)) as f2:
            line = f2.readline()
            while line:
                while not line.startswith('>') and line:
                    line = f2.readline()
                fas_name = ">" + re.sub(r"\W", "_", line.replace(">", "").strip())
                list_.append(fas_name)
                fas_seq = ""
                line = f2.readline()
                while not line.startswith('>') and line:
                    fas_seq += re.sub(r"\s", "", line)
                    line = f2.readline()
                list_.append(fas_seq)
        with open(str(output), "w") as f:
            f.write("\n".join(list_))
I have a df containing a set of videoIDs from YT:
import pandas as pd

data = {'Order': ['1', '2', '3'],
        'VideoID': ['jxwHmAoKte4', 'LsXM502SpiU', '1I3f27iQ4pM']
        }
df = pd.DataFrame(data, columns=['Order', 'VideoID'])
print(df)
and want to download the video descriptions and save them in the same df in an extra column.
I tried to use youtube_dl in Jupyter this way:
import youtube_dl

def all_descriptions(URL):
    videoID = df['VideoId']
    URL = 'https://www.youtube.com/watch?v=' + videoID
    ydl_opts = {
        'forcedescription': True,
        'skip_download': True,
        'youtube-skip-dash-manifest': True,
        'no_warnings': True,
        'ignoreerrors': True
    }
    try:
        youtube_dl.YoutubeDL(ydl_opts).download(URL)
        return webpage
    except:
        pass

df['descriptions'] = all_descriptions(URL)
I see the output of the code as text, but in the df the column only contains "None".
Obviously I'm not transferring the output of the function to the df properly.
Can you suggest how to get it right?
Thank you in advance for help.
I modified the df to include two URLs that cause two types of errors:
import pandas as pd

data = {'Order': ['1', '2', '3', '4', '5'],
        'VideoId': ['jxwHmAoKte4', 'LsXM502SpiU', '1I3f27iQ4pM', 'MGQOX2rK5s', 'wNayw_E7lIA']
        }
df = pd.DataFrame(data, columns=['Order', 'VideoId'])
print(df)
Then I tested it in the way you suggested, including my definition of ydl_opts:
videoID = df['VideoId']
URL = 'https://www.youtube.com/watch?v=' + videoID

ydl_opts = {
    'forcedescription': True,
    'skip_download': True,
    'youtube-skip-dash-manifest': True,
    'no_warnings': True,
    'ignoreerrors': True
}

df['description'] = [
    youtube_dl.YoutubeDL(ydl_opts).extract_info(
        u, download=False)['description'] for u in URL]
df
When it reaches the first failing URL, I get the output:
TypeError: 'NoneType' object is not subscriptable
After that I replaced 'forcedescription' in my code with 'extract_info':
def all_descriptions(URL):
    videoID = df['VideoId']
    URL = 'https://www.youtube.com/watch?v=' + videoID
    ydl_opts = {
        'forcedescription': True,
        'skip_download': True,
        'youtube-skip-dash-manifest': True,
        'no_warnings': True,
        'ignoreerrors': True
    }
    try:
        youtube_dl.YoutubeDL(ydl_opts).download(URL)
        return webpage
    except:
        pass
It skips all errors, but as a result there is nothing in the 'description' column.
Any suggestions?
You can use the extract_info method:
df['description'] = [
    youtube_dl.YoutubeDL().extract_info(
        u, download=False)['description'] for u in URL]
df
Output:
Order VideoID description
0 1 jxwHmAoKte4 Bundesweit gelten sie nun ab heute, die schärf...
1 2 LsXM502SpiU Wie sicher ist der Impfstoff? Wäre eine Impfpf...
2 3 1I3f27iQ4pM Impfen ja oder nein, diese Frage stellen sich ...
P.S. The forcedescription parameter only prints the description to standard output; it doesn't return it.
Update: extract_info returns None if it fails, so if some videos may fail, check that the info is not None before taking the description from it:
ydl = youtube_dl.YoutubeDL(ydl_opts)
infos = [ydl.extract_info(u, download=False) for u in URL]
df['description'] = [
    info['description'] if info is not None else ''
    for info in infos]
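For completeness, a self-contained sketch combining the pieces above (the IDs and options are the asker's; with ignoreerrors set, failed extractions come back as None instead of raising):

import pandas as pd
import youtube_dl

df = pd.DataFrame({'Order': ['1', '2'],
                   'VideoId': ['jxwHmAoKte4', 'MGQOX2rK5s']})   # the second ID fails on purpose
URL = 'https://www.youtube.com/watch?v=' + df['VideoId']

ydl_opts = {
    'skip_download': True,
    'no_warnings': True,
    'ignoreerrors': True,          # failed extractions return None instead of raising
}

ydl = youtube_dl.YoutubeDL(ydl_opts)
infos = [ydl.extract_info(u, download=False) for u in URL]
df['description'] = [info['description'] if info is not None else '' for info in infos]
print(df)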