Arranging data into lists from url - urllib2

The following code is written in Python 2. How can I write it in Python 3? Thanks.
import urllib2
import sys
#read data from uci data repository
target_url = ("https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data")
data = urllib2.urlopen(target_url)
#arrange data into list for labels and list of lists for attributes
xList = []
labels = []
for line in data:
    # split on comma
    row = line.strip().split(",")
    xList.append(row)

You can use the requests library with Python 3:
import requests

data = requests.get("https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data").text

xList = []
for line in data.split('\n'):
    row = line.strip().split(",")
    xList.append(row)
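If you would rather keep the original structure without adding a dependency, urllib2 was split into urllib.request and urllib.error in Python 3. Here is a minimal sketch of a direct port; note that urlopen now yields bytes, so each line is decoded before splitting:

from urllib.request import urlopen

target_url = ("https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data")

xList = []
labels = []
with urlopen(target_url) as data:
    for line in data:
        # decode the bytes returned by urlopen before splitting on commas
        row = line.decode('utf-8').strip().split(",")
        xList.append(row)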

Related

choropleth plotly map displaying a white background

I am trying to create a choropleth map of the UK using plotly, but every time I try it outputs an empty page, or the GeoJSON doesn't match up with the dataframe. This is where I obtained the URL for the dataframe. Here's my code so far:
import pandas as pd
from urllib.request import urlopen
import json

with urlopen('https://raw.githubusercontent.com/deldersveld/topojson/master/countries/united-kingdom/uk-counties.json') as response:
    geojson = json.load(response)

url3 = 'https://api.coronavirus.data.gov.uk/v2/data?areaType=utla&metric=cumCasesBySpecimenDate&metric=cumPeopleVaccinatedFirstDoseByVaccinationDate&metric=cumPeopleVaccinatedSecondDoseByVaccinationDate&metric=newCasesBySpecimenDate&metric=cumPeopleVaccinatedThirdInjectionByVaccinationDate&format=csv'
df = pd.read_csv(url3)
df_new = df.replace("areaName", "NAME_2")

from plotly import graph_objects as go

fig = go.Figure(
    go.Choroplethmapbox(
        geojson=geojson,
        featureidkey="properties.NAME_2",
        locations=df["areaCode"],
        z=df['cumCasesBySpecimenDate'],
        zauto=True,
        colorscale='Reds',
        showscale=True,
    )
)
fig.show()
A few things to fix this up:
uk-counties.json is in TopoJSON format, but plotly needs GeoJSON; you can convert it with the topojson module, for example, or with geopandas (a sketch of the geopandas route follows the code below).
There is no need to replace "areaName"; what you want is locations=df["areaName"].
You need to specify a mapbox_style; centering and zooming help as well.
For a good result you should use only one day's worth of data per choropleth, hence the df = df[df['date'] == '2022-11-23'].
The covid data and the topojson don't match up well by districts, so there are gaps in the map.
Code:
"""
https://stackoverflow.com/questions/71828342/choropleth-plotly-map-displaying-a-white-background
"""
from urllib.request import urlretrieve
import json
from io import StringIO
from plotly import graph_objects as go
import pandas as pd
import topojson as tp
URL_JSON = 'https://raw.githubusercontent.com/deldersveld/topojson/master/countries/united-kingdom/uk-counties.json'
URL_DATA = 'https://api.coronavirus.data.gov.uk/v2/data?areaType=utla&metric=cumCasesBySpecimenDate&metric=cumPeopleVaccinatedFirstDoseByVaccinationDate&metric=cumPeopleVaccinatedSecondDoseByVaccinationDate&metric=newCasesBySpecimenDate&metric=cumPeopleVaccinatedThirdInjectionByVaccinationDate&format=csv'
CSV_DATA = 'uk_covid.csv'
TOPO_DATA = 'topojson.json'
GEO_DATA = 'geojson.json'
def download():
urlretrieve(URL_JSON, TOPO_DATA)
with open(TOPO_DATA, 'r') as data:
topoJSON = json.load(StringIO(data.read()))
topo = tp.Topology(topoJSON, object_name='GBR_adm2')
# convert to geojson, store in GEO_DATA
topo.to_geojson(GEO_DATA)
df = pd.read_csv(URL_DATA)
df.to_csv(CSV_DATA)
def make_map():
df = pd.read_csv(CSV_DATA)
with open(GEO_DATA, 'r') as data:
geojson = json.load(StringIO(data.read()))
# one day at a time
df = df[df['date'] == '2022-11-23']
fig = go.Figure(
go.Choroplethmapbox(
geojson=geojson,
featureidkey="properties.NAME_2",
locations=df["areaName"], # <=== not areaCode
z=df['cumCasesBySpecimenDate'],
zauto=True,
colorscale='Reds',
showscale=True
)
)
# need a mapbox_style
fig.update_layout(mapbox_style='carto-positron',
mapbox_zoom=5,
mapbox_center_lon=-2.057852,
mapbox_center_lat=53.404854,
height=700,
width=700)
fig.show()
if 0: # only needed once
download()
make_map()
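As an aside, the geopandas route mentioned above could look roughly like this. This is only a sketch and assumes the installed fiona/GDAL build can read TopoJSON files, which recent builds generally can:

import json
import geopandas as gpd

# read the downloaded TopoJSON (OGR's GeoJSON driver also understands TopoJSON)
gdf = gpd.read_file(TOPO_DATA)
# convert to a plain GeoJSON dict that go.Choroplethmapbox can consume
geojson = json.loads(gdf.to_json())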

How to find the code that generates graphs?

Here is a code sample that you can find on 'https://realpython.com/':
import dash
import dash_core_components as dcc
import dash_html_components as html
import pandas as pd
data = pd.read_csv("avocado.csv")
data = data.query("type == 'conventional' and region == 'Albany'")
data["Date"] = pd.to_datetime(data["Date"], format="%Y-%m-%d")
data.sort_values("Date", inplace=True)
app = dash.Dash(__name__)
However, when I run this code, two graphs show up that do not seem to come from the code above.
Where can I find the code that generates these graphs? Why do these graphs appear?
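The lines shown above do not draw anything by themselves; in that tutorial the graphs come from dcc.Graph components declared in app.layout further down the page. A rough sketch of what that part looks like (the exact figure definitions here are an assumption based on the avocado.csv columns, not the tutorial's verbatim code):

app.layout = html.Div(
    children=[
        html.H1(children="Avocado Analytics"),
        # first graph: average price over time
        dcc.Graph(
            figure={
                "data": [{"x": data["Date"], "y": data["AveragePrice"], "type": "lines"}],
                "layout": {"title": "Average Price of Avocados"},
            }
        ),
        # second graph: total volume over time
        dcc.Graph(
            figure={
                "data": [{"x": data["Date"], "y": data["Total Volume"], "type": "lines"}],
                "layout": {"title": "Avocados Sold"},
            }
        ),
    ]
)

if __name__ == "__main__":
    app.run_server(debug=True)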

Combining CSV of different shapes into one CSV

I have CSVs with different numbers of rows and columns. I would like to create one large CSV where all the CSV data are stacked directly on top of each other, aligned by the first column. I tried the script below with limited success; b, which starts as an empty array, does not hold the data from the previous loop iterations.
from os import walk
import sys
import numpy as np

filenames = []
dirpath = []
filtered = []
original = []
f = []
b = np.empty([2, 2])

for (dirpath, dirnames, filenames) in walk("C:\\Users\\dkim1\\Python Scripts\\output"):
    f.extend(dirnames)
print(f)

for names in f:
    print(names)
    df = np.genfromtxt('C:\\Users\\dkim1\\Python Scripts\\output\\' + names + '\\replies.csv', dtype=None, delimiter=',', skip_header=1, names=True)
    b = np.column_stack(df)
    print(b)
Have you tried pd.concat()?
import os
import pandas as pd

# just used a single dir for example simplicity, rather than os.walk()
root_dir = "your directory path here"
file_names = os.listdir(root_dir)

cat_list = []
for names in file_names:
    df = pd.read_csv(os.path.join(root_dir, names), delimiter=',', header=None)
    cat_list.append(df)

concatted_df = pd.concat(cat_list)
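pd.concat stacks the frames row-wise and aligns columns by label. If you specifically want the stacking aligned on the first column, one sketch (the output name combined.csv is just a placeholder) is to make that column the index before concatenating, then write everything out once:

# set the first column as the index so the frames line up on it, then stack
aligned = [df.set_index(df.columns[0]) for df in cat_list]
combined = pd.concat(aligned)
combined.to_csv(os.path.join(root_dir, "combined.csv"))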

Bokeh Server: import .csv file with FileInput widget and pass it to ColumnDataSource

I have a csv file with my data to plot (x, y, and other fields) and want to import it using the new FileInput widget. I don't have sufficient knowledge to manipulate the "base64" strings coming from FileInput in order to pass them to a ColumnDataSource or a DataFrame.
from bokeh.io import curdoc
from bokeh.models.widgets import FileInput

def update_cds(attr, old, new):
    # Code here to extract column names and data
    # from file_input and pass them to a ColumnDataSource or a DataFrame
    pass

file_input = FileInput(accept=".csv")
file_input.on_change('value', update_cds)

doc = curdoc()
doc.add_root(file_input)
Thanks for your help!
Here is a working solution: the code will upload the csv file to a 'data' folder on the server side (which must be created beforehand). Then it is easy to open the csv and pass it to a ColumnDataSource, for instance.
import base64
import csv

# widget
file_input = FileInput(accept=".csv")

def upload_csv_to_server(attr, old, new):
    # decode the base64 format (from the python base64 docs)
    base64_message = file_input.value
    base64_bytes = base64_message.encode('ascii')
    message_bytes = base64.b64decode(base64_bytes)
    message = message_bytes.decode('ascii')

    # convert the string to csv rows and save it on the server side
    message_list = message.splitlines()
    with open('data/' + file_input.filename, 'w', newline='') as file:
        writer = csv.writer(file)
        for i in range(len(message_list)):
            writer.writerow(message_list[i].split(','))

file_input.on_change('value', upload_csv_to_server)
If you see a nicer way, please let me know. It is easy to do it this way with the csv structure, but what about other file formats?
Python has a built-in base64 standard library module:
import base64
data = base64.b64decode(encoded)
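Building on that, here is a sketch of skipping the intermediate file entirely and decoding the FileInput value straight into a DataFrame and a ColumnDataSource in memory (the callback and variable names are illustrative):

import base64
from io import BytesIO

import pandas as pd
from bokeh.io import curdoc
from bokeh.models import ColumnDataSource
from bokeh.models.widgets import FileInput

source = ColumnDataSource(data=dict())

def update_cds(attr, old, new):
    # 'new' holds the base64-encoded content of the uploaded file
    decoded = base64.b64decode(new)
    df = pd.read_csv(BytesIO(decoded))
    source.data = ColumnDataSource.from_df(df)

file_input = FileInput(accept=".csv")
file_input.on_change('value', update_cds)
curdoc().add_root(file_input)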

When using Pandas to_hdf is it possible to specify a column data type to vlen special_dtype / vlarray for ragged tensors?

I have a Pandas column which contains numpy arrays or lists of varying size. If I try to convert the dataframe to HDF5 using to_hdf, I get a message that says
PerformanceWarning:
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block0_values]
I am guessing this is because of the ragged tensors in the pandas column. h5py does have a special datatype for ragged tensors.
http://docs.h5py.org/en/stable/special.html#arbitrary-vlen-data
Example here
h5f = h5py.File('data.h5', 'w')
dt = h5py.special_dtype(vlen=np.dtype('int32'))
h5f.create_dataset('batch', data=yourData, dtype=dt, compression='gzip', compression_opts=9)
So I can convert the pandas df to numpy, and then save each numpy array separately, with the varying length column stored with the special vlen datatype.
I am wondering if there is a way to do this in Pandas.
The following is a minimal example using a small chunk of my data. It downloads and opens a small chunk of the dataframe and saves it to HDF5:
import requests
import pickle
import numpy as np
import pandas as pd

# Download function for google drive
def download_file_from_google_drive(id, destination):
    URL = "https://docs.google.com/uc?export=download"
    session = requests.Session()
    response = session.get(URL, params={'id': id}, stream=True)
    token = get_confirm_token(response)
    if token:
        params = {'id': id, 'confirm': token}
        response = session.get(URL, params=params, stream=True)
    save_response_content(response, destination)

def get_confirm_token(response):
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            return value
    return None

def save_response_content(response, destination):
    CHUNK_SIZE = 32768
    with open(destination, "wb") as f:
        for chunk in response.iter_content(CHUNK_SIZE):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)

# download the google drive file
download_file_from_google_drive('1-0R28Yhdrq2QWQ-4MXHIZUdZG2WZK2qR', 'sample.pkl')

sampleDF2 = pd.read_pickle('sample.pkl')
sampleDF2.to_hdf('pandasList.hdf', 'first', complevel=9)

sampleDF2['totalCites2'] = sampleDF2['totalCites2'].apply(lambda x: np.array(x))
sampleDF2.to_hdf('pandasNumpy.hdf', 'first', complevel=9)
For convenience, here is a colab notebook which has this code
https://colab.research.google.com/drive/1DjiPsN3MbRWP6NnJwvaAhzy66FNbPVA8
Edit:
As hpualj mentioned, Pandas uses PyTables, not h5py, so it looks like the question should be how to use VLArray, which is how PyTables stores variable-length arrays.
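For reference, a minimal sketch of writing the ragged column with PyTables directly, bypassing to_hdf (the file and node names here are arbitrary):

import numpy as np
import pandas as pd
import tables as tb

sampleDF2 = pd.read_pickle('sample.pkl')

with tb.open_file('raggedColumn.h5', mode='w') as h5:
    # a VLArray stores one variable-length int32 row per dataframe entry
    vlarray = h5.create_vlarray(h5.root, 'totalCites2', tb.Int32Atom(),
                                title='ragged totalCites2 column',
                                filters=tb.Filters(complevel=9))
    for values in sampleDF2['totalCites2']:
        vlarray.append(np.asarray(values, dtype='int32'))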