Arranging data into lists from url - urllib2

The following code is written in Python 2. How can I write it in Python 3? Thanks.
import urllib2
import sys
#read data from uci data repository
target_url = ("https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data")
data = urllib2.urlopen(target_url)
#arrange data into list for labels and list of lists for attributes
xList = []
labels = []
for line in data:
    # split on comma
    row = line.strip().split(",")
    xList.append(row)

You can use the requests library with Python 3:
import requests

data = requests.get("https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data").text

xList = []
for line in data.split('\n'):
    row = line.strip().split(",")
    xList.append(row)
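If you would rather keep the original structure without adding a dependency, urllib2 was split into urllib.request and urllib.error in Python 3. Here is a minimal sketch of a direct port; note that urlopen now yields bytes, so each line is decoded before splitting:

from urllib.request import urlopen

target_url = ("https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data")

xList = []
labels = []
with urlopen(target_url) as data:
    for line in data:
        # decode the bytes returned by urlopen before splitting on commas
        row = line.decode('utf-8').strip().split(",")
        xList.append(row)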

Related

choropleth plotly map displaying a white background

I am trying to create a choropleth map of the UK using plotly, but every time I try it outputs an empty page, or the GeoJSON doesn't match up with the dataframe. This is where I obtained the URL for the dataframe. Here's my code so far:
import pandas as pd
from urllib.request import urlopen
import json

with urlopen('https://raw.githubusercontent.com/deldersveld/topojson/master/countries/united-kingdom/uk-counties.json') as response:
    geojson = json.load(response)

url3 = 'https://api.coronavirus.data.gov.uk/v2/data?areaType=utla&metric=cumCasesBySpecimenDate&metric=cumPeopleVaccinatedFirstDoseByVaccinationDate&metric=cumPeopleVaccinatedSecondDoseByVaccinationDate&metric=newCasesBySpecimenDate&metric=cumPeopleVaccinatedThirdInjectionByVaccinationDate&format=csv'
df = pd.read_csv(url3)
df_new = df.replace("areaName", "NAME_2")

from plotly import graph_objects as go

fig = go.Figure(
    go.Choroplethmapbox(
        geojson=geojson,
        featureidkey="properties.NAME_2",
        locations=df["areaCode"],
        z=df['cumCasesBySpecimenDate'],
        zauto=True,
        colorscale='Reds',
        showscale=True,
    )
)
fig.show()
A few things to fix this up:
uk-counties.json is in TopoJSON format, but plotly needs GeoJSON; you can convert it with the topojson module, for example, or with geopandas (a sketch of the geopandas route follows the code below).
There is no need to replace "areaName"; what you want is locations=df["areaName"].
You need to specify a mapbox_style; centering and zooming help as well.
For a good result you should use only one day's worth of data per choropleth, hence the df = df[df['date'] == '2022-11-23'].
The covid data and the topojson don't match up well by districts, so there are gaps in the map.
Code:
"""
https://stackoverflow.com/questions/71828342/choropleth-plotly-map-displaying-a-white-background
"""
from urllib.request import urlretrieve
import json
from io import StringIO
from plotly import graph_objects as go
import pandas as pd
import topojson as tp
URL_JSON = 'https://raw.githubusercontent.com/deldersveld/topojson/master/countries/united-kingdom/uk-counties.json'
URL_DATA = 'https://api.coronavirus.data.gov.uk/v2/data?areaType=utla&metric=cumCasesBySpecimenDate&metric=cumPeopleVaccinatedFirstDoseByVaccinationDate&metric=cumPeopleVaccinatedSecondDoseByVaccinationDate&metric=newCasesBySpecimenDate&metric=cumPeopleVaccinatedThirdInjectionByVaccinationDate&format=csv'
CSV_DATA = 'uk_covid.csv'
TOPO_DATA = 'topojson.json'
GEO_DATA = 'geojson.json'
def download():
urlretrieve(URL_JSON, TOPO_DATA)
with open(TOPO_DATA, 'r') as data:
topoJSON = json.load(StringIO(data.read()))
topo = tp.Topology(topoJSON, object_name='GBR_adm2')
# convert to geojson, store in GEO_DATA
topo.to_geojson(GEO_DATA)
df = pd.read_csv(URL_DATA)
df.to_csv(CSV_DATA)
def make_map():
df = pd.read_csv(CSV_DATA)
with open(GEO_DATA, 'r') as data:
geojson = json.load(StringIO(data.read()))
# one day at a time
df = df[df['date'] == '2022-11-23']
fig = go.Figure(
go.Choroplethmapbox(
geojson=geojson,
featureidkey="properties.NAME_2",
locations=df["areaName"], # <=== not areaCode
z=df['cumCasesBySpecimenDate'],
zauto=True,
colorscale='Reds',
showscale=True
)
)
# need a mapbox_style
fig.update_layout(mapbox_style='carto-positron',
mapbox_zoom=5,
mapbox_center_lon=-2.057852,
mapbox_center_lat=53.404854,
height=700,
width=700)
fig.show()
if 0: # only needed once
download()
make_map()
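As an aside, the geopandas route mentioned above could look roughly like this. This is only a sketch and assumes the installed fiona/GDAL build can read TopoJSON files, which recent builds generally can:

import json
import geopandas as gpd

# read the downloaded TopoJSON (OGR's GeoJSON driver also understands TopoJSON)
gdf = gpd.read_file(TOPO_DATA)
# convert to a plain GeoJSON dict that go.Choroplethmapbox can consume
geojson = json.loads(gdf.to_json())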

How to find the code that generates graphs?

Here is a code sample that you can find on 'https://realpython.com/':
import dash
import dash_core_components as dcc
import dash_html_components as html
import pandas as pd
data = pd.read_csv("avocado.csv")
data = data.query("type == 'conventional' and region == 'Albany'")
data["Date"] = pd.to_datetime(data["Date"], format="%Y-%m-%d")
data.sort_values("Date", inplace=True)
app = dash.Dash(__name__)
However, when I run this code, two graphs show up that do not seem to come from the code above.
Where can I find the code that generates these graphs? Why do these graphs appear?
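The lines shown above do not draw anything by themselves; in that tutorial the graphs come from dcc.Graph components declared in app.layout further down the page. A rough sketch of what that part looks like (the exact figure definitions here are an assumption based on the avocado.csv columns, not the tutorial's verbatim code):

app.layout = html.Div(
    children=[
        html.H1(children="Avocado Analytics"),
        # first graph: average price over time
        dcc.Graph(
            figure={
                "data": [{"x": data["Date"], "y": data["AveragePrice"], "type": "lines"}],
                "layout": {"title": "Average Price of Avocados"},
            }
        ),
        # second graph: total volume over time
        dcc.Graph(
            figure={
                "data": [{"x": data["Date"], "y": data["Total Volume"], "type": "lines"}],
                "layout": {"title": "Avocados Sold"},
            }
        ),
    ]
)

if __name__ == "__main__":
    app.run_server(debug=True)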

Combining CSV of different shapes into one CSV

I have CSVs with different numbers of rows and columns. I would like to create one large CSV where all the CSV data are stacked directly on top of each other, aligned by the first column. I tried the script below with limited success; b, which starts as an empty array, does not hold the data from the previous loop iterations.
from os import walk
import sys
import numpy as np

filenames = []
dirpath = []
filtered = []
original = []
f = []
b = np.empty([2, 2])

for (dirpath, dirnames, filenames) in walk("C:\\Users\\dkim1\\Python Scripts\\output"):
    f.extend(dirnames)
print(f)

for names in f:
    print(names)
    df = np.genfromtxt('C:\\Users\\dkim1\\Python Scripts\\output\\' + names + '\\replies.csv', dtype=None, delimiter=',', skip_header=1, names=True)
    b = np.column_stack(df)
    print(b)
Have you tried pd.concat()?
import os
import pandas as pd

# just used a single dir for example simplicity, rather than os.walk()
root_dir = "your directory path here"
file_names = os.listdir(root_dir)

cat_list = []
for names in file_names:
    df = pd.read_csv(os.path.join(root_dir, names), delimiter=',', header=None)
    cat_list.append(df)

concatted_df = pd.concat(cat_list)
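pd.concat stacks the frames row-wise and aligns columns by label. If you specifically want the stacking aligned on the first column, one sketch (the output name combined.csv is just a placeholder) is to make that column the index before concatenating, then write everything out once:

# set the first column as the index so the frames line up on it, then stack
aligned = [df.set_index(df.columns[0]) for df in cat_list]
combined = pd.concat(aligned)
combined.to_csv(os.path.join(root_dir, "combined.csv"))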

Bokeh Server: import .csv file with FileInput widget and pass it to ColumnDataSource

I have a csv file with my data to plot (x, y, and other fields) and want to import it using the new FileInput widget. I don't have sufficient knowledge to manipulate the "base64" strings coming from FileInput in order to pass them to a ColumnDataSource or a DataFrame.
from bokeh.io import curdoc
from bokeh.models.widgets import FileInput

def update_cds(attr, old, new):
    # Code here to extract column names and data
    # from file_input and pass them to a ColumnDataSource or a DataFrame
    pass

file_input = FileInput(accept=".csv")
file_input.on_change('value', update_cds)

doc = curdoc()
doc.add_root(file_input)
Thanks for your help!
Here is a working solution: the code will upload the csv file to a 'data' folder on the server side (which must be created beforehand). Then it is easy to open the csv and pass it to a ColumnDataSource, for instance.
import base64
import csv

# widget
file_input = FileInput(accept=".csv")

def upload_csv_to_server(attr, old, new):
    # decode the base64 format (from the python base64 docs)
    base64_message = file_input.value
    base64_bytes = base64_message.encode('ascii')
    message_bytes = base64.b64decode(base64_bytes)
    message = message_bytes.decode('ascii')

    # convert the string to csv rows and save it on the server side
    message_list = message.splitlines()
    with open('data/' + file_input.filename, 'w', newline='') as file:
        writer = csv.writer(file)
        for i in range(len(message_list)):
            writer.writerow(message_list[i].split(','))

file_input.on_change('value', upload_csv_to_server)
If you see a nicer way, please let me know. It is easy to do it this way with the csv structure, but what about other file formats?
Python has a built-in base64 standard library module:
import base64
data = base64.b64decode(encoded)
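Building on that, here is a sketch of skipping the intermediate file entirely and decoding the FileInput value straight into a DataFrame and a ColumnDataSource in memory (the callback and variable names are illustrative):

import base64
from io import BytesIO

import pandas as pd
from bokeh.io import curdoc
from bokeh.models import ColumnDataSource
from bokeh.models.widgets import FileInput

source = ColumnDataSource(data=dict())

def update_cds(attr, old, new):
    # 'new' holds the base64-encoded content of the uploaded file
    decoded = base64.b64decode(new)
    df = pd.read_csv(BytesIO(decoded))
    source.data = ColumnDataSource.from_df(df)

file_input = FileInput(accept=".csv")
file_input.on_change('value', update_cds)
curdoc().add_root(file_input)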

When using Pandas to_hdf is it possible to specify a column data type to vlen special_dtype / vlarray for ragged tensors?

I have a Pandas column which contains numpy arrays or lists of varying size. If I try to convert the dataframe to HDF5 using to_hdf, I get a message that says
PerformanceWarning:
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block0_values]
I am guessing this is because of the ragged tensors in the pandas column. h5py does have a special datatype for ragged tensors.
http://docs.h5py.org/en/stable/special.html#arbitrary-vlen-data
Example here
h5f = h5py.File('data.h5', 'w')
dt = h5py.special_dtype(vlen=np.dtype('int32'))
h5f.create_dataset('batch', data=yourData, dtype=dt, compression='gzip', compression_opts=9)
So I can convert the pandas df to numpy, and then save each numpy array separately, with the varying length column stored with the special vlen datatype.
I am wondering if there is a way to do this in Pandas.
The following is a minimal example using a small chunk of my data. It downloads and opens a small chunk of the dataframe and saves it to HDF5:
import requests
import pickle
import numpy as np
import pandas as pd

# Download function for google drive
def download_file_from_google_drive(id, destination):
    URL = "https://docs.google.com/uc?export=download"
    session = requests.Session()
    response = session.get(URL, params={'id': id}, stream=True)
    token = get_confirm_token(response)
    if token:
        params = {'id': id, 'confirm': token}
        response = session.get(URL, params=params, stream=True)
    save_response_content(response, destination)

def get_confirm_token(response):
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            return value
    return None

def save_response_content(response, destination):
    CHUNK_SIZE = 32768
    with open(destination, "wb") as f:
        for chunk in response.iter_content(CHUNK_SIZE):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)

# download the google drive file
download_file_from_google_drive('1-0R28Yhdrq2QWQ-4MXHIZUdZG2WZK2qR', 'sample.pkl')

sampleDF2 = pd.read_pickle('sample.pkl')
sampleDF2.to_hdf('pandasList.hdf', 'first', complevel=9)

sampleDF2['totalCites2'] = sampleDF2['totalCites2'].apply(lambda x: np.array(x))
sampleDF2.to_hdf('pandasNumpy.hdf', 'first', complevel=9)
For convenience, here is a colab notebook which has this code
https://colab.research.google.com/drive/1DjiPsN3MbRWP6NnJwvaAhzy66FNbPVA8
Edit:
As hpualj mentioned, Pandas uses PyTables, not h5py, so it looks like the question should be how to use VLArray, which is how PyTables stores variable-length arrays.
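For reference, a minimal sketch of writing the ragged column with PyTables directly, bypassing to_hdf (the file and node names here are arbitrary):

import numpy as np
import pandas as pd
import tables as tb

sampleDF2 = pd.read_pickle('sample.pkl')

with tb.open_file('raggedColumn.h5', mode='w') as h5:
    # a VLArray stores one variable-length int32 row per dataframe entry
    vlarray = h5.create_vlarray(h5.root, 'totalCites2', tb.Int32Atom(),
                                title='ragged totalCites2 column',
                                filters=tb.Filters(complevel=9))
    for values in sampleDF2['totalCites2']:
        vlarray.append(np.asarray(values, dtype='int32'))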