To read CSV in Google Colaboratory - google-colaboratory

from google.colab import files
uploaded = files.upload()
import io

def ls(ruta=uploaded):
    return [arch.name for arch in io.StringIO((ruta)) if arch.is_file()]

divisas = ls()
I have this error:
TypeError: initial_value must be str or None, not dict

from google.colab import files
uploaded = files.upload()
Import the google.colab library for the file upload, upload the file, and then pass the file name to pandas' read_csv function:
import io
import pandas as pd
df2 = pd.read_csv(io.BytesIO(uploaded['heart.csv']))
df2.head()

I need to include the file names in the divisas list.
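A hedged note on the original TypeError: files.upload() returns a dict keyed by file name (with the file contents as values), which is why passing it to io.StringIO fails with "initial_value must be str or None, not dict". If the goal is just a list of the uploaded file names, the dict's keys are already that list; a minimal sketch:

from google.colab import files

uploaded = files.upload()  # dict: {file name: file contents as bytes}

# The dict keys are the uploaded file names; no io.StringIO or
# is_file() check is needed.
divisas = list(uploaded.keys())
print(divisas)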

Related

How can I solve the "Unsupported format, or corrupt file: Expected BOF record; found ..." error?

import pandas as pd
import xlrd
import os
import matplotlib.pylab as plt

file_folder_address = 'C:/Users/Amirreza/Desktop/python homeworks/project files'
df_total = pd.DataFrame()
for file in os.listdir(file_folder_address):  # os.listdir gives a list of Excel file names
    df_men_urb = pd.DataFrame()
    df_women_urb = pd.DataFrame()
    df_men_rural = pd.DataFrame()
    df_women_rural = pd.DataFrame()
    sheet_names = pd.ExcelFile(os.path.join(file_folder_address, file), engine="xlrd").sheet_names
When I use this code I get the above error. What should I do?
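A hedged suggestion: this error usually means xlrd was handed something that is not a legacy .xls workbook. xlrd 2.0 and later only read .xls files, and os.listdir may also pick up non-Excel files sitting in the folder. A sketch that filters the listing and chooses an engine by extension (it assumes openpyxl is installed for .xlsx files; the folder path is the asker's):

import os
import pandas as pd

file_folder_address = 'C:/Users/Amirreza/Desktop/python homeworks/project files'

for file in os.listdir(file_folder_address):
    # Skip anything that is not an Excel workbook (e.g. desktop.ini, temp files).
    if not file.lower().endswith(('.xls', '.xlsx')):
        continue
    # xlrd >= 2.0 only reads legacy .xls; use openpyxl for .xlsx.
    engine = 'xlrd' if file.lower().endswith('.xls') else 'openpyxl'
    sheet_names = pd.ExcelFile(os.path.join(file_folder_address, file), engine=engine).sheet_names
    print(file, sheet_names)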

How to create multiple pandas-profiling reports for multiple CSV files in a directory? The report name should match the file name.

I tried this:
import glob
import os
import pandas as pd
import pandas_profiling
from pandas_profiling import ProfileReport

files = glob.glob(r"D:\home_health_services_current_data\*.csv")
df = pd.DataFrame()
for f in files:
    csv = pd.read_csv(f)
    df = df.append(csv)

profile = ProfileReport(df, title="Profiling Report", explorative=True)
profile.to_file(r"D:\proj_report\profilerep\prof_report.html")
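This attempt concatenates every CSV into one DataFrame and writes a single report. To get one report per file, named after the file, the ProfileReport can be built inside the loop instead; a minimal sketch using the asker's paths (it assumes the output directory already exists):

import glob
import os
import pandas as pd
from pandas_profiling import ProfileReport

files = glob.glob(r"D:\home_health_services_current_data\*.csv")
for f in files:
    df = pd.read_csv(f)
    # Name the report after the CSV file, e.g. data.csv -> data.html
    name = os.path.splitext(os.path.basename(f))[0]
    profile = ProfileReport(df, title=f"Profiling Report - {name}", explorative=True)
    profile.to_file(os.path.join(r"D:\proj_report\profilerep", f"{name}.html"))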

Pandas read_csv failing on gzipped file with OSError: Not a gzipped file (b'NU')

I used the code shown below to load the csv.gz file, but I got this error:
OSError: Not a gzipped file (b'NU')
How can I solve it?
Code:
import pandas as pd
data = pd.read_csv('climat.202010.csv.gz', compression='gzip')
print(data)
Or:
import gzip
import pandas as pd

filename = 'climat.202010.csv.gz'
with gzip.open(filename, 'rb') as f:
    data = pd.read_csv(f)
Try:
import gzip

with gzip.open(filename, 'rb') as fio:
    df = pd.read_csv(fio)
This works for me:
import gzip
import pandas as pd

with gzip.open(r'C:\Users\MyUser\OneDrive - Company\Data\Wiser\Files\WiserWeeklyReport.csv.gz') as f:
    wiser_report = pd.read_csv(f)
wiser_report.head()
If you're still getting an error, it may be the file or the file name. Have you tried taking out the extra period in the file name?
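A hedged diagnostic: "OSError: Not a gzipped file (b'NU')" means the first two bytes of the file are not the gzip magic number (b'\x1f\x8b'), i.e. the file is not actually gzip-compressed despite its .gz extension; it may be a plain CSV that was never compressed. A quick check, assuming the asker's file name:

# Inspect the first bytes to see what the file really is.
with open('climat.202010.csv.gz', 'rb') as f:
    magic = f.read(2)

if magic == b'\x1f\x8b':
    print('Real gzip file; pd.read_csv(..., compression="gzip") should work.')
else:
    # b'NU' suggests plain text; try reading without decompression.
    print('Not gzip, starts with:', magic)

If the bytes are not the gzip magic, reading the file with pd.read_csv(filename, compression=None) may work despite the .gz extension.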

read csv file from buffer got EmptyDataError?

I need to read a CSV-formatted string with pandas, but pandas raises an error. I don't know what happened; can anyone help me?
import pandas as pd
import io
s = ',测试项,信息,结果\r\n0,软件测试机型805,软件测试机型805,PASS\r\n1,软件当前版本1,软件当前版本1,FAIL\r\n2,软件测试机型805,软件测试机型805,PASS\r\n3,软件当前版本1,软件当前版本1,FAIL\r\n4,软件测试机型805,软件测试机型805,PASS\r\n5,软件当前版本1,软件当前版本1,FAIL\r\n'
buf = io.StringIO()
buf.write(s)
df = pd.read_csv(buf)
I got this error: EmptyDataError: No columns to parse from file
Here you go, mate:
import pandas as pd
import io
s = ',测试项,信息,结果\r\n0,软件测试机型805,软件测试机型805,PASS\r\n1,软件当前版本1,软件当前版本1,FAIL\r\n2,软件测试机型805,软件测试机型805,PASS\r\n3,软件当前版本1,软件当前版本1,FAIL\r\n4,软件测试机型805,软件测试机型805,PASS\r\n5,软件当前版本1,软件当前版本1,FAIL\r\n'
buf = io.StringIO()
buf.write(s)
buf.seek(0)  # rewind to the start; write() leaves the cursor at the end, so read_csv would see an empty stream
df = pd.read_csv(buf)

Very poor speed/memory performance when using tf.data for text data

I am trying to use this code
https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/udacity/5_word2vec.ipynb
But use tf.data to handle all the text data.
I am running both over Google Colaboratory using the GPU runtime option.
Here is the original relevant code:
from __future__ import print_function
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import zipfile
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
from sklearn.manifold import TSNE

url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
    """Download a file if not present, and make sure it's the right size."""
    if not os.path.exists(filename):
        filename, _ = urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified %s' % filename)
    else:
        print(statinfo.st_size)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')
    return filename

filename = maybe_download('text8.zip', 31344016)

def read_data(filename):
    """Extract the first file enclosed in a zip file as a list of words"""
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data

words = read_data(filename)
print('Data size %d' % len(words))
Here is my best attempt to alter this to use tf.data to handle the text data.
from __future__ import print_function
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import zipfile
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
from sklearn.manifold import TSNE

url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
    """Download a file if not present, and make sure it's the right size."""
    if not os.path.exists(filename):
        filename, _ = urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified %s' % filename)
    else:
        print(statinfo.st_size)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')
    return filename

filename = maybe_download('text8.zip', 31344016)

def read_data(filename):
    """Extract the first file enclosed in a zip file as a list of words"""
    with zipfile.ZipFile(filename) as f:
        data = tf.data.Dataset.from_tensor_slices( f.read(f.namelist()[0]) )
    return data

datasetTest = read_data(filename)
Basically I changed this line from the original
data = tf.compat.as_str(f.read(f.namelist()[0])).split()
to this line
data = tf.data.Dataset.from_tensor_slices( f.read(f.namelist()[0]) )
The new line doesn't split the words on whitespace the way the old line did, so I had another line, #datasetTest = datasetTest.map(lambda string: tf.string_split([string]).values), but I commented it out to try to pinpoint where the bottleneck is.
The old code runs within a minute or two. The new one never finishes executing; it usually runs for 30-40 minutes before Colab says it has crashed and restarts the runtime.
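A hedged guess at the bottleneck: f.read(f.namelist()[0]) returns the entire ~100 MB text8 corpus as a single byte string, and from_tensor_slices embeds its argument as one giant constant before slicing, which is slow and memory-hungry (and a single unsplit string is not a sequence of words in the first place). Splitting in plain Python first, as the original line does, and only then wrapping the word list in tf.data avoids that; a sketch under those assumptions:

import zipfile
import tensorflow as tf

def read_data_tfdata(filename):
    """Read the zip in Python, split into words, then wrap in tf.data."""
    with zipfile.ZipFile(filename) as f:
        words = tf.compat.as_str(f.read(f.namelist()[0])).split()
    # from_tensor_slices over a Python list of words yields a dataset of
    # scalar string tensors, one word per element.
    return tf.data.Dataset.from_tensor_slices(words)

filename = 'text8.zip'  # downloaded earlier via maybe_download
dataset = read_data_tfdata(filename)

Note that even this embeds the full word list as a constant in the graph; for corpora much larger than text8, a generator-based dataset or tf.data.TextLineDataset may be the better fit.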