I installed Python/pandas on a new PC:
Successfully installed pandas-datareader-0.2.1 requests-file-1.4.1
But my old code stopped working after I replaced pandas.io with pandas_datareader.
import pandas_datareader.data as web
import datetime
import pandas as pd  # DataFrame lives in pandas, not pandas_datareader

# Price-history window.
start = datetime.datetime(2010, 1, 1)
end = datetime.datetime(2013, 1, 27)
# Daily Ford (F) quotes from Yahoo! Finance.
f = web.DataReader("F", 'yahoo', start, end)
columns = ['Open', 'High', 'Low', 'Close', 'DateIdx']
# FIX: the original called web.DataFrame(...), but pandas_datareader.data
# exposes no DataFrame attribute (that is the AttributeError below) —
# construct the empty frame with pandas itself.
diDian = pd.DataFrame(columns=columns)
I get this error:
File "delme1.py", line 9, in
diDian = web.DataFrame(columns=columns)
AttributeError: 'module' object has no attribute 'DataFrame'
How can I fix this, please?
OK, this works:
import pandas_datareader.data as web
import datetime

# Daily Ford (F) quotes from Yahoo! Finance over the study window.
start = datetime.datetime(2010, 1, 1)
end = datetime.datetime(2013, 1, 27)
f = web.DataReader("F", 'yahoo', start, end)

# Add a placeholder date-index column, then keep only the OHLC columns
# plus that placeholder — this avoids constructing a separate frame.
f['DateIdx'] = 0
wanted = ['Open', 'High', 'Low', 'Close', 'DateIdx']
diDian = f[wanted]
Related
I have the following code but it seems the line
cs.groupby(cs['Disbursal_Date'].dt.strftime('%B'))['Revenue'].sum()
just returns the entire dataframe without grouping the data by month.
Any help is much appreciated
import pandas as pd
import os
import glob
import numpy as np

# Work out of the folder that holds the CSV exports.
os.chdir("C:/csv/")
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
# One frame from all files; drop the leading unnamed index column.
cs = pd.concat([pd.read_csv(f) for f in all_filenames])
cs.drop(cs.columns[[0]], axis=1, inplace=True)
cs = cs[cs["Booked"] == 1]
cs['Disbursal_Date'] = pd.to_datetime(cs['Disbursal_Date'])
cs.drop_duplicates(inplace=True)
# Revenue schedule: flat 28 below 1000; above 1000, 5.25% of the loan
# when APR < 0.3, else 2.75%.
# NOTE(review): Loan_Amount == 1000 and APR == 0.3 fall through to 0
# (then get floored to 52.50 below) — confirm whether that is intended.
cs['Revenue'] = np.where(cs['Loan_Amount'] < 1000, 28,
np.where((cs['Loan_Amount'] > 1000) & (cs['APR'] < 0.3), 0.0525 * cs['Loan_Amount'],
np.where((cs['Loan_Amount'] > 1000) & (cs['APR'] > 0.3), 0.0275 * cs['Loan_Amount'], 0)))
# Cap and floor the per-loan revenue.
cs.loc[cs.Revenue >= 175, "Revenue"] = 175
cs.loc[cs.Revenue <= 52.50, "Revenue"] = 52.50
# FIX: groupby returns a new object — the original computed the monthly
# sum and threw it away, then printed the ungrouped frame. Assign the
# aggregate and print that instead.
monthly_revenue = cs.groupby(cs['Disbursal_Date'].dt.strftime('%B'))['Revenue'].sum()
print(monthly_revenue)
You're not assigning the result from your cs.groupby. Something like:
cs = cs.groupby(cs['Disbursal_Date'].dt.strftime('%B'))['Revenue'].sum()
print(cs)
Should do the trick.
In my script, I use the pandas module. When I execute my file.py, everything works well. But after I converted my file.py to file.exe with auto-py-to-exe, I got an error: AttributeError: 'str' object has no attribute 'unique'. It's strange because it worked normally before. The line where the error occurs is: wells=list(file[0].unique()). If anyone knows this issue, please help.
import datetime
import os
import sys
import tkinter as tk
import tkinter.filedialog as fd
from datetime import datetime, date

import numpy as np
import pandas as pd
# Console-friendly pandas display settings: plain-text repr and wide
# output so frames print readably when run as a console exe.
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 80)
pd.set_option('display.max_rows', 200)
pd.set_option('display.width', 800)
def resource_path(relative_path):
    """Return the absolute path of a bundled resource file.

    Works both inside a PyInstaller one-file bundle (where resources are
    unpacked to ``sys._MEIPASS``) and when running from source (where
    they sit next to the current working directory).

    FIX: the original relied on ``except Exception`` to swallow the
    NameError caused by ``sys`` never being imported; with ``sys``
    imported, probe the attribute explicitly instead.
    """
    base_path = getattr(sys, "_MEIPASS", None)
    if base_path is None:
        base_path = os.path.abspath(".")
    return os.path.join(base_path, relative_path)
def open():
    """Ask the user for an .xlsx file, load it, and close the window.

    Side effects: sets the globals ``file_excel`` (the workbook read
    with the first row skipped and no header row) and ``name`` (the
    file's stem, used later as the suggested save name), then destroys
    the enclosing Tk window.

    NOTE(review): shadows the builtin ``open``; renaming it would also
    require updating the button's ``command=open`` reference below.
    """
    global file_excel, name
    # Dialog title is Russian for "Open file".
    file_excel = fd.askopenfilename(initialdir='/Desktop', title='Открыть файл', filetypes = [("Excel", "*.xlsx")])
    name = os.path.basename(file_excel)
    # Strip the .xlsx extension to get the bare workbook name.
    name=os.path.splitext(name)[0]
    file_excel=pd.read_excel(file_excel, skiprows=[0], header=None)
    win.destroy()
    return file_excel, name
# --- Window 1: choose the source Excel workbook ---------------------------
win = tk.Tk()
# Icon is bundled with the script/exe; resource_path resolves it both
# from source and from a PyInstaller bundle.
path = resource_path("image.png")
photo = tk.PhotoImage(file=path)
win.iconphoto(False, photo)
win.config(bg='#FFC')
win.title('Конвертация в формат .ev')  # "Conversion to .ev format"
win.geometry('400x130+500+500')
win.resizable(False, False)
# NOTE(review): .pack() returns None, so label_1/btn_1 are None — harmless
# here because the widgets are never referenced again.
label_1 = tk.Label(win, text = 'Выберите файл с испытаниями скважин:',
                   bg = '#FFC',
                   font=('Arial', 10, 'bold'),
                   padx=20,
                   pady=10).pack()
# The button runs open() above, which loads the workbook and closes win.
btn_1 = tk.Button(win, text = 'Выбрать Excel',
                  command = open,
                  activebackground = '#6F6',
                  font=('Arial', 12, 'bold'),
                  padx=20,
                  pady=10,
                  relief = tk.RAISED,
                  bd=2).pack()
# Blocks until open() destroys the window.
win.mainloop()
# Distinct well names from column 0 of the loaded workbook.
wells = list(file_excel[0].unique())
# Column 1: dates reformatted as dd/mm/YYYY strings (unparseable -> NaN).
file_excel[1] = pd.to_datetime(file_excel[1], errors='coerce').dt.strftime("%d/%m/%Y")
# NOTE(review): `.str` is the string-accessor OBJECT, which is always
# truthy, so every row is tagged 'Perforation'; if the intent was "only
# rows with a parseable date", this should test file_excel[1].notna().
file_excel[4] = np.where(file_excel[1].str, 'Perforation', np.nan)
# Depth columns must be positive in the .ev format.
file_excel.iloc[:, [2, 3]] = file_excel.iloc[:, [2, 3]].abs()
# Swap columns 2 and 4 by relabeling, so the event tag precedes depths.
col_list = list(file_excel)
col_list[4], col_list[2] = col_list[2], col_list[4]
file_excel.columns = col_list
# Assemble the .ev body: per well, a 'WELLNAME <name>' header row, the
# well's event rows, then one blank spacer row.
# FIX: DataFrame.append was removed in pandas 2.0 — collect the pieces
# and concatenate once instead (same output, and O(n) rather than
# quadratic re-copying).
pieces = []
for i in wells:
    pieces.append(pd.DataFrame({1: ['WELLNAME ' + i]}))
    pieces.append(file_excel.iloc[:, [1, 2, 3, 4]][file_excel.iloc[:, 0] == i])
    pieces.append(pd.DataFrame(index=[0]))  # blank separator row
Perforation = pd.concat(pieces, ignore_index=True) if pieces else pd.DataFrame(data=None)
def SaveFile():
    """Prompt for a destination and write ``Perforation`` as .ev text.

    Reads the globals ``name`` (suggested filename) and ``Perforation``
    (the assembled frame), writes it without index/header, then destroys
    the save window.

    NOTE(review): ``asksaveasfile`` returns None when the dialog is
    cancelled, which would make the next line raise; the handle is also
    never explicitly closed — confirm whether that matters here.
    """
    Save=fd.asksaveasfile(mode='w',defaultextension=".ev", initialfile=name)
    Save.write(Perforation.to_string(index=False, header=False, na_rep='      '))
    win.destroy()
# --- Window 2: save the converted data ------------------------------------
win = tk.Tk()
path = resource_path("image.png")
photo = tk.PhotoImage(file=path)
win.iconphoto(False, photo)
win.config(bg='#FFC')
win.title('Конвертация в формат .ev')  # "Conversion to .ev format"
win.geometry('400x130+500+500')
win.resizable(False, False)
# NOTE(review): .pack() returns None, so label_1/btn_1 are None — harmless
# because they are never referenced again.
label_1 = tk.Label(win, text = 'Сохранение:',
                   bg = '#FFC',
                   font=('Arial', 10, 'bold'),
                   padx=20,
                   pady=10).pack()
# The button runs SaveFile() above, which writes the .ev file and
# closes this window.
btn_1 = tk.Button(win, text = 'Сохранить как',
                  command = SaveFile,
                  activebackground = '#6F6',
                  font=('Arial', 12, 'bold'),
                  padx=20,
                  pady=10,
                  relief = tk.RAISED,
                  bd=2).pack()
win.mainloop()
type of file[0]
Error screen
When I created the virtual env I should have added the openpyxl module. I added it and everything is fine now.
I'm trying to merge/join x and y dataframes based on an exact match of the company columns and a partial match of some degree on the name columns.
Other than looking at the values returned by SequenceMatcher(None, x_name, y_name).ratio(), which were always above .8 in my case, I haven't tried much that warrants mentioning.
# Left frame: employees with tenure, indexed by their id.
x = pd.DataFrame(
    {'name': ['Robert Jackson', 'William Johnson'],
     'company': ['Test inc.', 'Test inc.'],
     'tenure': [6, 6]},
    index=pd.Index([1, 2], name='id'))
# Right frame: near-duplicate names with a job column, different ids.
y = pd.DataFrame(
    {'name': ['Bob Jackson', 'Willy Johnson'],
     'company': ['Test inc.', 'Test inc.'],
     'job': ['desk', 'desk']},
    index=pd.Index([4, 5], name='id'))
# Desired merge result: exact company match + fuzzy name match.
goal = pd.DataFrame(
    {'x_id': [1, 2], 'y_id': [4, 5],
     'x_name': ['Robert Jackson', 'William Johnson'],
     'y_name': ['Bob Jackson', 'Willy Johnson'],
     'company': ['Test inc.', 'Test inc.'],
     'tenure': [6, 6], 'job': ['desk', 'desk']})
Is something like this plausible? I'd appreciate any feedback, thank you.
Great question! I'm following to see other answers as I've been doing a lot of similar work lately. One inefficient method I've taken is to use fuzzywuzzy based on a threshold.
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
def fuzzy_merge(df_1, df_2, key1, key2, threshold=90, limit=1):
    """Attach a 'matches' column to df_1 holding the df_2[key2] values
    whose fuzzywuzzy score against df_1[key1] meets `threshold`.

    Mutates and returns df_1; `limit` caps candidates per row.
    """
    choices = df_2[key2].tolist()
    candidates = df_1[key1].apply(lambda value: process.extract(value, choices, limit=limit))
    df_1['matches'] = candidates
    df_1['matches'] = df_1['matches'].apply(
        lambda found: ', '.join(match for match, score in found if score >= threshold))
    return df_1
The solution that I used was:
from difflib import SequenceMatcher

# Tag each y row with the matching x values, then merge on the tags.
x['merge_name'] = x['name']
x['merge_comp'] = x['company']
for a, b in x[['name', 'company']].values:
    # FIX: the original used enumerate's positional counter as a .loc
    # label (y.loc[ixb, ...]); with y indexed by 'id' (4, 5) that
    # ENLARGED y with phantom rows labelled 0 and 1 instead of tagging
    # the real rows. Pair the values with the actual index labels.
    for ixb, (c, d) in zip(y.index, y[['name', 'company']].values):
        if SequenceMatcher(None, a, c).ratio() >= .8:
            y.loc[ixb, 'merge_name'] = a
        if SequenceMatcher(None, b, d).ratio() == 1:
            y.loc[ixb, 'merge_comp'] = b
goal = pd.merge(x, y, on=['merge_name', 'merge_comp'])
This function worked while passing an arbitrary number of columns:
def sm_merge(df1, df2, columns=[], ratios=[], prefix='m_', reset_index=False, post_drop=True):
    """Fuzzy-merge df1 and df2 on an arbitrary number of columns.

    For each column in `columns`, every df2 row whose value matches a
    df1 value with SequenceMatcher ratio >= the corresponding entry of
    `ratios` is tagged (in a temporary `prefix`-named column) with the
    df1 value; the frames are then merged on those temporary columns.

    NOTE: mutates df1/df2 in place (reset_index, temporary columns);
    when `post_drop` is True the temporary columns are removed from the
    inputs afterwards, but the returned merged frame keeps them.
    """
    if reset_index:
        df1.reset_index(inplace=True)
        df2.reset_index(inplace=True)
    # Copy each match column so the originals survive the merge.
    merge_columns = []
    for col in columns:
        df1[prefix + col] = df1[col]
        merge_columns.append(prefix + col)
    # FIX: the original advanced its column cursor with `flag =+ 1`
    # (i.e. `flag = +1`), so it never moved past index 1 and silently
    # broke for more than two columns; it also duplicated entries in
    # merge_columns. Iterate the columns directly instead. The debug
    # print(type(col_2)) is removed as well.
    for col, merge_col, ratio in zip(columns, merge_columns, ratios):
        for col_1 in df1[col].values:
            for index, col_2 in enumerate(df2[col].values):
                if SequenceMatcher(None, str(col_1), str(col_2)).ratio() >= ratio:
                    df2.loc[index, merge_col] = col_1
    df = pd.merge(df1, df2, on=merge_columns)
    if post_drop:
        df1.drop(columns=merge_columns, inplace=True)
        df2.drop(columns=merge_columns, inplace=True)
    return df
# Example: fuzzy-match names at ratio >= 0.8 and companies exactly.
sm_merge(x, y, columns=['name', 'company'], ratios=[.8, 1], reset_index=True)
This function worked for passing exactly 2 columns/ratios:
def sm_merge(df1, df2, columns=[], ratios=[], prefix='m_', reset_index=True, post_drop=True):
    """Fuzzy-merge two frames on exactly two columns.

    Rows of df2 whose values match df1 values with SequenceMatcher
    ratio >= ratios[0] / ratios[1] are tagged with the df1 values in
    temporary `prefix`-named columns; the frames are merged on those
    tags. Inputs are copied, so df1/df2 are left untouched; when
    `post_drop` is True the temporary columns are dropped from the
    returned frame.
    """
    left = df1.copy()
    right = df2.copy()
    if reset_index:
        left.reset_index(inplace=True)
        right.reset_index(inplace=True)
    first, second = columns[0], columns[1]
    merge_columns = [prefix + first, prefix + second]
    # Seed the tag columns on the left with the left-hand values.
    left[merge_columns[0]] = left[first]
    left[merge_columns[1]] = left[second]
    # Tag each right-hand row with the left-hand value it matches.
    for col_1, col_2 in left[[first, second]].values:
        for row, (col_3, col_4) in enumerate(right[[first, second]].values):
            if SequenceMatcher(None, str(col_1), str(col_3)).ratio() >= ratios[0]:
                right.loc[row, merge_columns[0]] = col_1
            if SequenceMatcher(None, str(col_2), str(col_4)).ratio() >= ratios[1]:
                right.loc[row, merge_columns[1]] = col_2
    merged = pd.merge(left, right, on=merge_columns)
    if post_drop:
        merged.drop(columns=merge_columns, inplace=True)
    return merged
# Example: fuzzy-match names at ratio >= 0.8 and companies exactly.
sm_merge(x,y,columns=['name', 'company'], ratios=[.8,1])
import pandas_datareader as web
import datetime

# Requested price-history window.
start = datetime.datetime(2015, 1, 1)
end = datetime.datetime(2018, 7, 30)
# NOTE(review): this line raises the ValueError quoted below — per the
# traceback, the 'nasdaq' source only accepts the literal name 'symbols'
# (the symbol directory), not a ticker; use a quote source such as
# 'stooq' or 'iex' for AAPL price history instead.
f = web.DataReader("AAPL", "nasdaq", start, end)
I run this code and get the following error:
File "/anaconda3/lib/python3.6/site-packages/pandas_datareader/data.py", line 377, in DataReader
"Nasdaq, not %r" % (name,))
ValueError: Only the string 'symbols' is supported for Nasdaq, not 'AAPL'
How can I fix this? Please help.
I am looking to add a grouped bar chart to an Excel workbook; however, I can't seem to find a way to do so.
Here is my code:
bar_chart2 = workbook.add_chart({'type':'column'})
bar_chart2.add_series({
    'name':'Month over month product',
    # NOTE(review): the sheet name contains spaces, so the range
    # references must single-quote it: "='Month over month'!$H$2:$H$6".
    'categories':'=Month over month!$H$2:$H$6',
    # NOTE(review): 'values' spans two columns ($I..$J); a series takes
    # one column — add one add_series() call per data column instead.
    'values':'=Month over month!$I$2:$J$6',
})
bar_chart2.set_legend({'none': True})
worksheet5.insert_chart('F8',bar_chart2)
# NOTE(review): the two lines below duplicate the two above — the
# legend is configured and the chart inserted twice.
bar_chart2.set_legend({'none': True})
worksheet5.insert_chart('F8',bar_chart2)
However, I get an error.
Using your provided data, I re-worked the Example given in the Docs by jmcnamara (link here) to suit what you're looking for.
Full Code:
import pandas as pd
import xlsxwriter

# Sheet layout: column H = category labels, columns I/J = the two series.
headings = [' ', 'Apr 2017', 'May 2017']
data = [
    ['NGN', 'UGX', 'KES', 'TZS', 'CNY'],
    [5816, 1121, 115, 146, 1],
    [7089, 1095, 226, 120, 0],
]

# Create the workbook and lay out the demo data.
workbook = xlsxwriter.Workbook("test.xlsx")
worksheet5 = workbook.add_worksheet('Month over month')
worksheet5.write_row('H1', headings)
for offset, anchor in enumerate(('H2', 'I2', 'J2')):
    worksheet5.write_column(anchor, data[offset])

# One series per month column. The sheet name contains spaces, so it is
# single-quoted inside every range formula.
bar_chart2 = workbook.add_chart({'type':'column'})
for col in ('I', 'J'):
    bar_chart2.add_series({
        'name': "='Month over month'!$%s$1" % col,
        'categories': "='Month over month'!$H$2:$H$6",
        'values': "='Month over month'!$%s$2:$%s$6" % (col, col),
    })
bar_chart2.set_title({'name': 'Month over month product'})
bar_chart2.set_style(11)
# Legend intentionally left visible (the OP's set_legal({'none': True})
# call stayed commented out in the original answer).
worksheet5.insert_chart('F8', bar_chart2)
workbook.close()
Output: