Convert pandas DataFrame to DatasetDict

I cannot find anywhere how to convert a pandas DataFrame to the type datasets.dataset_dict.DatasetDict, for optimal use in a BERT workflow with a Hugging Face model. Take these simple DataFrames, for example:
train_df = pd.DataFrame({
    "label": [1, 2, 3],
    "text": ["apple", "pear", "strawberry"]
})
test_df = pd.DataFrame({
    "label": [2, 2, 1],
    "text": ["banana", "pear", "apple"]
})
What is the most efficient way to convert these to the type above?

One possibility is to first create two Datasets and then join them:
import datasets
import pandas as pd

train_df = pd.DataFrame({
    "label": [1, 2, 3],
    "text": ["apple", "pear", "strawberry"]
})
test_df = pd.DataFrame({
    "label": [2, 2, 1],
    "text": ["banana", "pear", "apple"]
})

train_dataset = datasets.Dataset.from_pandas(train_df)
test_dataset = datasets.Dataset.from_pandas(test_df)
my_dataset_dict = datasets.DatasetDict({"train": train_dataset, "test": test_dataset})
The result is:
DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 3
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 3
    })
})
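Since the question mentions a BERT workflow, here is a minimal sketch of how the resulting DatasetDict can be tokenized with map. It assumes the transformers library and the "bert-base-uncased" checkpoint, neither of which is part of the original question:
from transformers import AutoTokenizer

# any BERT-style tokenizer works the same way; this checkpoint is just an example
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize(batch):
    # tokenize the "text" column of each batch of examples
    return tokenizer(batch["text"], truncation=True, padding="max_length")

# map() is applied to every split in the DatasetDict at once
tokenized = my_dataset_dict.map(tokenize, batched=True)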

Related

Default display of annotations in Plotly/Python using a button

I managed to add the possibility to switch the annotations in a Plotly chart on and off. After executing the code, I would like to see the chart without annotations; if required, the annotations could then be activated (Label: OFF).
The following code displays the annotations by default, and I am not able to set it up so that the chart is first shown without annotations.
import plotly.express as px
import pandas as pd

# assign data of lists
data = {'x': ["2022-05-06", "2022-05-08", "2022-05-09", "2022-05-12", "2022-05-15",
              "2022-05-16", "2022-05-22", "2022-05-24", "2022-05-26"],
        'y': [0, 1, 8, 2, 4, 3, 4, 6, 5],
        'text': ["", "", "Annotation1", "", "Annotation2", "", "", "", "Annotation3"]}

# Create DataFrame
df = pd.DataFrame(data)

# assign data of lists
data1 = {'x': ["2022-05-07", "2022-05-14", "2022-05-23"],
         'text': ["Annotation1", "Annotation2", "Annotation3"]}
df1 = pd.DataFrame(data1)

fig = px.line(df, x='x', y='y', title='Annotations ON / OFF')

# build the annotation dicts from df1
arrow_list = []
counter = 0
for i in df1['text'].tolist():
    if i != "":
        arrow = dict(x=df1['x'].values[counter], y=9, xref="x", yref="y",
                     text=i, arrowhead=2, ax=0,
                     arrowwidth=1.5,
                     bordercolor="#c7c7c7",
                     borderwidth=2,
                     borderpad=4,
                     bgcolor="#ff7f0e",
                     opacity=0.8,
                     font=dict(
                         family="Courier New, monospace",
                         size=16,
                         color="#ffffff"
                     ),
                     arrowcolor='rgb(255,51,0)')
        arrow_list.append(arrow)
        counter += 1
    else:
        counter += 1

fig.update_layout(
    updatemenus=[
        dict(
            type="buttons",
            # direction="right",
            active=0,
            showactive=True,
            buttons=list([
                dict(label="Label:Off",
                     method="update",
                     args=[{"visible": [True, False, True, False]},
                           {"annotations": []}]),
                dict(label="Label:On",
                     method="update",
                     args=[{"visible": [True, True, True, True]},
                           {"annotations": arrow_list}]),
            ]),
        )
    ])
fig.update_layout(annotations=arrow_list)
fig.show()
I have tried to modify args and active, but without any success.
The first display of the chart (annotations should be off):
I found the solution: the annotations are simply not applied to the initial layout (the final fig.update_layout(annotations=arrow_list) call is dropped), so the chart starts without them and the "Label: On" button adds them on demand:
import plotly.graph_objects as go
import pandas as pd

# Load dataset
# assign data of lists
data = {'Date': ["2022-05-06", "2022-05-08", "2022-05-09", "2022-05-12", "2022-05-15",
                 "2022-05-16", "2022-05-22", "2022-05-24", "2022-05-26"],
        'High': [0, 1, 8, 2, 4, 3, 4, 6, 5],
        'text': ["", "", "Annotation1", "", "Annotation2", "", "", "", "Annotation3"]}

# Create DataFrame
df = pd.DataFrame(data)

# Initialize figure
fig = go.Figure()

# Add Traces
fig.add_trace(
    go.Scatter(x=list(df.Date),
               y=list(df.High),
               name="High",
               line=dict(color="#33CFA5")))

# build the annotation dicts from the rows of df that carry a label
arrow_list = []
counter = 0
for i in df['text'].tolist():
    if i != "":
        arrow = dict(x=df['Date'].values[counter], y=9, xref="x", yref="y",
                     text=i, arrowhead=2, ax=0,
                     arrowwidth=1.5,
                     bordercolor="#c7c7c7",
                     borderwidth=2,
                     borderpad=4,
                     bgcolor="#ff7f0e",
                     opacity=0.8,
                     font=dict(
                         family="Courier New, monospace",
                         size=16,
                         color="#ffffff"
                     ),
                     arrowcolor='rgb(255,51,0)')
        arrow_list.append(arrow)
        counter += 1
    else:
        counter += 1

fig.update_layout(
    updatemenus=[
        dict(
            active=0,
            buttons=list([
                dict(label="Label: Off",
                     method="update",
                     args=[{"visible": [True, False, True, False]},
                           {"title": "Labels Off",
                            "annotations": []}]),
                dict(label="Label: On",
                     method="update",
                     args=[{"visible": [True, True, False, False]},
                           {"title": "Labels On",
                            "annotations": arrow_list}]),
            ]),
        )
    ])
fig.show()

How to change pandas display font of index column

import pandas as pd

data = {
    'X': [3, 2, 0, 1],
    'Y': [0, 3, 7, 2]
}
df = pd.DataFrame(data, index=['A', 'B', 'C', 'D'])
df.style.set_properties(**{
    'font-family': 'Courier New'
})
df
The index column is displayed in bold; is it possible to change the font of the index column?
You must use table_styles. In this example I set "font-weight": "normal" for the index and columns.
Let's define some test data:
import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3, 4],
                   'B': [5, 4, 3, 1]})
We define the style customization to use:
styles = [
    dict(selector="th", props=[("font-weight", "normal"),
                               ("text-align", "center")])]
We pass the style variable as the argument for set_table_styles():
html = (df.style.set_table_styles(styles))
html
And the output is:
Please feel free to read the pandas Styling documentation for more details.
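If you only want to restyle the index header cells and leave the column headers alone, a minimal sketch is below. It assumes the default CSS classes that the pandas Styler assigns (row_heading for index cells, col_heading for column headers), which are not shown in the original answer:
import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3, 4],
                   'B': [5, 4, 3, 1]}, index=['A', 'B', 'C', 'D'])

# "th.row_heading" should match only the index header cells,
# leaving the column headers ("th.col_heading") untouched
index_styles = [
    dict(selector="th.row_heading",
         props=[("font-weight", "normal"),
                ("font-family", "Courier New")])]

df.style.set_table_styles(index_styles)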

order plotly legend by custom order

I have a plotly figure to which I add values from two dataframes, df_a and df_b. I display both dataframes in different subplots, but they share a legend. How can I order the shared legend? The expected order is: [a, b, c, f, g]. Please see below for the current implementation - it seems to pick the order from input data in some way.
import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
from plotly.offline import iplot

df_a = pd.DataFrame(columns=["date", "a", "b", "c"], data=[
    [pd.to_datetime("31Jan20"), 3, 4, 5],
    [pd.to_datetime("31Mar20"), 3, 4, 5],
    [pd.to_datetime("30Jun20"), 3, 4, 5],
])
df_b = pd.DataFrame(columns=["date", "a", "g", "f"], data=[
    [pd.to_datetime("31Jan20"), 8, 5, 4],
    [pd.to_datetime("31Mar20"), 3, 4, 5],
    [pd.to_datetime("30Jun20"), 3, 4, 5],
])

buckets = ["a", "b", "c", "f", "g"]
fig_subplots = make_subplots(rows=2, cols=1, shared_xaxes=True, subplot_titles=["df_a", "df_b"])

def get_chart(df, buckets, d_legend):
    fig = go.Figure()
    colorscale = px.colors.qualitative.Pastel
    i_color = 0
    unique_dates = [str(dt) for dt in df.date.unique()]
    for bucket in sorted(buckets, reverse=True):
        y_values = df[df['variable'] == bucket]["value"].to_list()
        enable_legend_for_bucket = False if bucket in d_legend else True
        fig.add_trace(go.Bar(
            name=bucket,
            x=unique_dates,
            y=y_values,
            marker_color=colorscale[i_color],
            legendgroup=bucket,
            showlegend=enable_legend_for_bucket
        ))
        if len(y_values) != 0:
            d_legend[bucket] = True  # store first time this bucket was populated for legend
        i_color += 1
    fig.update_layout(barmode="stack")
    return fig, d_legend

list_df = [df_a.melt(id_vars="date"), df_b.melt(id_vars="date")]
d_legend = {}
iRow = 1
for df in list_df:
    fig, d_legend = get_chart(df, buckets, d_legend)
    for el in fig['data']:
        fig_subplots.append_trace(el, iRow, 1)
    iRow += 1

fig_subplots.update_layout(barmode='stack', legend={'traceorder': 'normal'})
I did not find an easy built-in method to order the legend, but reordering the traces on the figure keeps the same stacking of values while sorting the legend in any desired order:
new_order = sorted(['a', 'b', 'c', 'g', 'f'], reverse=True)
print(new_order)

ordered_object_list = []
for i in new_order:
    item = [obj for obj in fig_subplots.data if obj['name'] == i]
    ordered_object_list += item

fig_subplots.data = ordered_object_list
fig_subplots.update_layout(legend={'traceorder': 'reversed'})
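As a side note that is not part of the original answer: recent Plotly versions (5.0+) expose a legendrank attribute on each trace, so the legend can be ordered without reordering fig_subplots.data. A minimal sketch, reusing fig_subplots from above:
# desired bucket order for the legend
desired_order = ["a", "b", "c", "f", "g"]

for trace in fig_subplots.data:
    # lower legendrank values appear earlier in the legend
    trace.legendrank = desired_order.index(trace.name)

fig_subplots.show()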
There is also an approach with plotly express:
import pandas as pd
import plotly.express as px

df_a = pd.DataFrame(columns=["date", "a", "b", "c"], data=[
    [pd.to_datetime("31Jan20"), 3, 4, 5],
    [pd.to_datetime("31Mar20"), 3, 4, 5],
    [pd.to_datetime("30Jun20"), 3, 4, 5],
])
df_b = pd.DataFrame(columns=["date", "a", "g", "f"], data=[
    [pd.to_datetime("31Jan20"), 8, 5, 4],
    [pd.to_datetime("31Mar20"), 3, 4, 5],
    [pd.to_datetime("30Jun20"), 3, 4, 5],
])

df_a["id"] = "df_a"
df_b["id"] = "df_b"

# DataFrame.append was removed in pandas 2.0; concat does the same job
df_c = pd.concat([df_a, df_b])

px.bar(df_c.melt(id_vars=["date", "id"]), x="date", y="value", color="variable",
       facet_col="id", range_y=[0, 20], facet_col_wrap=1)
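With the plotly express route, the legend order can also be set directly via the category_orders argument (a standard px parameter, not used in the original answer). A small sketch reusing df_c from above:
px.bar(df_c.melt(id_vars=["date", "id"]), x="date", y="value", color="variable",
       facet_col="id", range_y=[0, 20], facet_col_wrap=1,
       # category_orders controls the order of the "variable" categories,
       # and therefore the order of the legend entries
       category_orders={"variable": ["a", "b", "c", "f", "g"]})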

Can you change the caption font size using Pandas styling?

I am trying to change the font size of a caption using the pandas styling API. Is this possible?
Here is what I have so far:
import pandas as pd
data = {'col1': [1, 2], 'col2': [3, 4]}
df = pd.DataFrame(data=data)
df.style.set_caption("Some Caption")
Appreciate any input.
Try this:
df.style.set_caption("Some Caption").set_table_styles([{
    'selector': 'caption',
    'props': [
        ('color', 'red'),
        ('font-size', '16px')
    ]
}])
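In a Jupyter notebook the styled table (including the caption CSS) renders directly. If you need the result outside a notebook, here is a small sketch for exporting it, assuming pandas 1.3+ where Styler.to_html is available:
styled = df.style.set_caption("Some Caption").set_table_styles([{
    'selector': 'caption',
    'props': [('color', 'red'), ('font-size', '16px')]
}])

# write the styled table, caption styling included, to a standalone HTML file
with open("styled_table.html", "w") as f:
    f.write(styled.to_html())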

Pandas input function with multiple value sparse categorical data

Given a pandas dataframe
df = pd.DataFrame([
    [1, ["a", "b"], 10],
    [2, ["b"], 20],
], columns=["a", "b", "label"])
where column "b" is a list of values representing sparse categorical data, how can I create an input function to feed to the estimator in train and predict?
Using pandas_input_fn does not work because of the b column:
train_fn = tf.estimator.inputs.pandas_input_fn(x=df[["a", "b"]], y=df.label, shuffle=True)
-- Error --
tensorflow.python.framework.errors_impl.InternalError: Unable to get element as bytes.
I can create a TFRecords file, write the data using a BytesList for column b, and read it with TFRecordDataset, then parse column b with a parse function using VarLenFeature; that works.
But how can I feed this data into the estimator using an in-memory object/DataFrame and/or pandas_input_fn?
Below is all my code:
import tensorflow as tf
import pandas as pd
from tensorflow.estimator.inputs import pandas_input_fn
from tensorflow.estimator import DNNRegressor
from tensorflow.feature_column import numeric_column, indicator_column, categorical_column_with_vocabulary_list
from tensorflow.train import Feature, Features, BytesList, FloatList, Example
from tensorflow.python_io import TFRecordWriter

df = pd.DataFrame([
    [1, ["a", "b"], 10],
    [2, ["b"], 20],
], columns=["a", "b", "label"])

writer = TFRecordWriter("test.tfrecord")
for row in df.iterrows():
    dict_feature = {}
    label_values = []
    for e in row[1].iteritems():
        if e[0] == "a":
            dict_feature.update({e[0]: Feature(float_list=FloatList(value=[e[1]]))})
        elif e[0] == "b":
            dict_feature.update({e[0]: Feature(bytes_list=BytesList(value=[m.encode('utf-8') for m in e[1]]))})
        elif e[0] == "label":
            dict_feature.update({e[0]: Feature(float_list=FloatList(value=[e[1]]))})
    example = Example(features=Features(feature=dict_feature))
    writer.write(example.SerializeToString())
writer.close()

def parse_tfrecords(example_proto):
    feature_description = {}
    feature_description.update({"a": tf.FixedLenFeature(shape=[], dtype=tf.float32)})
    feature_description.update({"b": tf.VarLenFeature(dtype=tf.string)})
    feature_description.update({"label": tf.FixedLenFeature(shape=[], dtype=tf.float32)})
    parsed_features = tf.parse_single_example(example_proto, feature_description)
    features = {key: parsed_features[key] for key in ["a", "b"]}
    label = parsed_features["label"]
    return features, label

def tf_record_input_fn(filenames_pattern):
    def _input_fn():
        dataset = tf.data.TFRecordDataset(filenames=filenames_pattern)
        dataset = dataset.shuffle(buffer_size=128)
        dataset = dataset.map(map_func=parse_tfrecords)
        dataset = dataset.batch(batch_size=128)
        return dataset
    return _input_fn

feature_columns = [
    numeric_column("a"),
    indicator_column(categorical_column_with_vocabulary_list("b", vocabulary_list=['a', 'b']))
]
estimator = DNNRegressor(feature_columns=feature_columns, hidden_units=[1])
train_input_fn = tf_record_input_fn("test.tfrecord")
# Next line does not work
# train_input_fn = tf.estimator.inputs.pandas_input_fn(x=df[["a", "b"]], y=df.label, shuffle=True)
estimator.train(train_input_fn)
I do not have a complete solution for your query because of my lack of experience with the tensorflow.estimator API, but is it possible for you to reshape your dataframe instead? If the values in the lists of column b are categorical in nature, maybe you can try one-hot encoding them and in the process add more columns to df. That way, your df becomes processable by all estimators in general.
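A minimal sketch of that one-hot encoding idea, using plain pandas (explode plus get_dummies; the resulting column names b_a and b_b are simply what get_dummies produces with the chosen prefix):
import pandas as pd

df = pd.DataFrame([
    [1, ["a", "b"], 10],
    [2, ["b"], 20],
], columns=["a", "b", "label"])

# explode the list column so every list element gets its own row,
# one-hot encode it, then collapse back to one row per original record
one_hot = pd.get_dummies(df["b"].explode(), prefix="b").groupby(level=0).max()

# replace the list column with the new indicator columns
df_encoded = pd.concat([df.drop(columns="b"), one_hot], axis=1)
print(df_encoded)
# columns: a, label, b_a, b_b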