How can i download a zipped file from the internet using pandas 0.17.1 and python 3.5 - pandas

What am i doing wrong? here is what i am trying to do:
import pandas as pd
url='http://data.octo.dc.gov/feeds/crime_incidents/archive/crime_incidents_2013_CSV.zip'
df = pd.read_csv(url, compression='gzip',
header=0, sep=',', quotechar='"',
engine = 'python')

#Abbas, thanks so much. Indeed i ran it step by step and here is what i came up with. Not the fastest indeed, but it works fine.
I ran it with pandas 0.18.1 on python 3.5.1 on Mac
from zipfile import ZipFile
from urllib.request import urlopen
import pandas as pd
import os
URL = \
'http://data.octo.dc.gov/feeds/crime_incidents/archive/crime_incidents_2013_CSV.zip'
# open and save the zip file onto computer
url = urlopen(URL)
output = open('zipFile.zip', 'wb') # note the flag: "wb"
output.write(url.read())
output.close()
# read the zip file as a pandas dataframe
df = pd.read_csv('zipFile.zip') # pandas version 0.18.1 takes zip files
# if keeping on disk the zip file is not wanted, then:
os.remove(zipName) # remove the copy of the zipfile on disk
I hope this helps. Thanks!

The answer by Cy Bu didn't quite work for me in Python 3.6 on Windows. I was getting an invalid argument error when trying to open the file. I modified it slightly:
import os
from urllib.request import urlopen, Request
r = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
b2 = [z for z in url.split('/') if '.zip' in z][0] #gets just the '.zip' part of the url
with open(b2, "wb") as target:
target.write(urlopen(r).read()) #saves to file to disk
data = pd.read_csv(b2, compression='zip') #opens the saved zip file
os.remove(b2) #removes the zip file

IIUC here is a solution instead of directly passing zip file to pandas, first unzip it and then pass the csv file:
from StringIO import StringIO
from zipfile import ZipFile
from urllib import urlopen
import pandas as pd
url = urlopen("http://data.octo.dc.gov/feeds/crime_incidents/archive/crime_incidents_2013_CSV.zip")
zipfile = ZipFile(StringIO(url.read()))
f = open(zipfile.NameToInfo.keys()[0],'wb')
f.write(zipfile.open(zipfile.NameToInfo.keys()[0]).read())
f.close()
df = pd.read_csv(zipfile.NameToInfo.keys()[0])
And will produce a DataFrame like this:

Related

Import multiple files in pandas

I am trying to import multiple files in pandas. I have created 3 files in the folder
['File1.xlsx', 'File2.xlsx', 'File3.xlsx'] as read by files = os.listdir(cwd)
import os
import pandas as pd
cwd = os.path.abspath(r'C:\Users\abc\OneDrive\Import Multiple files')
files = os.listdir(cwd)
df = pd.DataFrame()
for file in files:
if file.endswith('.xlsx'):
df = df.append(pd.read_excel(file), ignore_index=True)
df.head()
# df.to_excel('total_sales.xlsx')
print (files)
Upon running the code, I am getting the error (even though the file does exist in the folder)
FileNotFoundError: [Errno 2] No such file or directory: 'File1.xlsx'
Ideally, I want a code where I define a list of files in a LIST and then read the files through the loop using the path and the file LIST.
I think the following should work
import os
import pandas as pd
cwd = os.path.abspath(r'C:\Users\abc\OneDrive\Import Multiple files')
paths = [os.path.join(cwd,path) for path in os.listdir(cwd) if path.endswith('.xlsx')]
df = pd.concat(pd.read_excel(path,ignore_index=True) for path in paths)
df.head()
The idea is to get a list of full paths and then read them all in and concatenate them into a single dataframe on the next line

Pandas - xls to xlsx converter

I want python to take ANY .xls file from given location and save it as .xlsx with original file name? How I can do that so anytime I paste file to location it will be converted to xlsx with original file name?
import pandas as pd
import os
for filename in os.listdir('./'):
if filename.endswith('.xls'):
df = pd.read_excel(filename)
df.to_excel(??)
Your code seems to be perfectly fine. In case you are only missing the correct way to write it with the given name, here you go.
import pandas as pd
import os
for filename in os.listdir('./'):
if filename.endswith('.xls'):
df = pd.read_excel(filename)
df.to_excel(f"{os.path.splitext(filename)[0]}.xlsx")
A possible extension to convert any file that gets pasted inside the folder can be implemented with an infinite loop, for instance:
import pandas as pd
import os
import time
while True:
files = os.listdir('./')
for filename in files:
out_name = f"{os.path.splitext(filename)[0]}.xlsx"
if filename.endswith('.xls') and out_name not in files:
df = pd.read_excel(filename)
df.to_excel(out_name)
time.sleep(10)

Reading csv in colab errors

I'm trying to import a file to c-lab. I've tried various versions https://buomsoo-kim.github.io/colab/2018/04/15/Colab-Importing-CSV-and-JSON-files-in-Google-Colab.md/
#import packages
import pandas as pd
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import io
print("Setup Complete")
from google.colab import files
uploaded = files.upload()
# Read the file into a variable power_data
#power_data = pd.read("DE_power prices historical.csv")
data = pd.read_csv('DE_power prices historical.csv', error_bad_lines=False)
Keep getting error:
enter image description here
Try using this method it works a bit easier:
Upload .csv files to your Google Drive
Run the following code in your Colab cell:
from google.colab import drive
drive.mount('/content/drive')
Follow the link the output cell gives you and verify your Gmail account
Import using Pandas like:
power_data = pd.read_csv('/content/drive/My Drive/*filename.csv*')
Mount google drive in google-colab
from google.colab import drive
drive.mount('/content/drive')
copy file path add into URL variable
import pandas as pd
url = 'add copy path your csv file'
df=pd.read_csv(url)
df.head()

Pandas file reader error FileNotFoundError: [WinError 3]

I have the following
import os
import pandas as pd
path = 'C:/PanelComplete/FileForPeter/'
for folder in os.listdir(path):
for file in os.listdir(folder):
df = pd.read_csv(path+folder+'/'+file,engine='python')
df1 = df.groupby('codprg').size().reset_index(name='counts')
df1.to_csv(spath1+folder+'.csv', index=False,encoding='utf-8')
it causes the following problem FileNotFoundError: [WinError 3] The system cannot find the path specified: '20180101'
even the path is right as in the following snipping
This case is asked rapidly but my case is different
The problem is in the second for you are placing the folder name only instead of the full path (path+folder), hence you are not correctly addressing your desired directory. This should work:
import os
import pandas as pd
path = 'C:/PanelComplete/FileForPeter/'
for folder in os.listdir(path):
for file in os.listdir(path+folder):
df = pd.read_csv(path+folder+'/'+file,engine='python')
df1 = df.groupby('codprg').size().reset_index(name='counts')
df1.to_csv(spath1+folder+'.csv', index=False,encoding='utf-8')

Generating a NetCDF from a text file

Using Python can I open a text file, read it into an array, then save the file as a NetCDF?
The following script I wrote was not successful.
import os
import pandas as pd
import numpy as np
import PIL.Image as im
path = 'C:\path\to\data'
grb = [[]]
for fn in os.listdir(path):
file = os.path.join(path,fn)
if os.path.isfile(file):
df = pd.read_table(file,skiprows=6)
grb.append(df)
df2 = pd.np.array(grb)
#imarray = im.fromarray(df2) ##cannot handle this data type
#imarray.save('Save_Array_as_TIFF.tif')
i once used xray or xarray (they renamed them selfs) to get a NetCDF file into an ascii dataframe... i just googled and appearantly they have a to_netcdf function
import xarray and it allows you to treat dataframes just like pandas.
so give this a try:
df.to_netcdf(file_path)
xarray slow to save netCDF