I'm trying to ingest a csv file into bigquery using apache beam and dataflow here's my code:
import logging
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
gcs_bucket_name = "gs://bck-fr-fichiers-manuel-dev/de_par_categorie_et_code_rome/"
target_table_annonce = 'fr-parisraw-dev-8ef8:pole_emploi.de_par_categorie_et_code_rome'
table_schema_annonce = {'fields': [
{'name': 'cd_metier_rome', 'type': 'STRING', 'mode': 'NULLABLE'},
{'name': 'lb_metier_rome', 'type': 'STRING', 'mode': 'NULLABLE'},
{'name': 'cd_departement', 'type': 'STRING', 'mode': 'NULLABLE'},
{'name': 'lb_departement', 'type': 'STRING', 'mode': 'NULLABLE'},
{'name': 'nb_demandeur', 'type': 'STRING', 'mode': 'NULLABLE'},
{'name': 'mois', 'type': 'STRING', 'mode': 'NULLABLE'}
]}
# DoFn
class PrepareBqRowDoFn(beam.DoFn):
def process(self, element, *args, **kwargs):
logging.basicConfig(level=logging.INFO)
DOFN_LOGGER = logging.getLogger("PREPAREBQROWDOFN_LOGGER")
import csv
from datetime import datetime, timedelta
import re
# element = re.sub(r'(?=[^"]+)¤(?=[^"]+)', '', element)
line = csv.reader(element.splitlines(), quotechar='"',
delimiter=';',quoting=csv.QUOTE_ALL, skipinitialspace=True)
for row in line:
try:
bq_row = {"cd_metier_rome": row[0],
"lb_metier_rome": row[1],
"cd_departement": row[2],
"lb_departement": row[3],
"nb_demandeur": row[4],
"mois": row[5]
}
yield bq_row
except IndexError:
DOFN_LOGGER.info("Error Row : " + element)
def run():
pipeline = beam.Pipeline(options=PipelineOptions())
file_patterns = ['de_par_*.csv']
for file_pattern in file_patterns:
csv_lines = pipeline | 'Read File From GCS {}'.format(file_pattern) >> beam.io.ReadFromText(
gcs_bucket_name + file_pattern)
bq_row = csv_lines | 'Create Row {}'.format(file_pattern) >> beam.ParDo(PrepareBqRowDoFn())
bq_row | 'Write to BQ {}'.format(file_pattern) >> beam.io.Write(beam.io.WriteToBigQuery(
target_table_annonce,
schema=table_schema_annonce,
create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
pipeline.run()
if __name__ == '__main__':
logging.getLogger().setLevel(logging.INFO)
run()
The pipeline generated looks like this :
Each step I can see the the rows are being treated by Dataflow :
Step 1 (Read File From GCS de_par_*.csv) :
Step 2 (Create Row de_par_*.csv) :
But the final step 3 (Write to BQ de_par_*.csv) :
I get 0 lines
Related
I'm trying to run the following code in Jupyter notebook - but it just keeps running endlessly with no output. I'm following the tutorial from: https://data-flair.training/blogs/stock-price-prediction-machine-learning-project-in-python/
The code is from the stock_app.py which doesn't seem to be working:
import dash
import dash_core_components as dcc
import dash_html_components as html
import pandas as pd
import plotly.graph_objs as go
from dash.dependencies import Input, Output
from keras.models import load_model
from sklearn.preprocessing import MinMaxScaler
import numpy as np
app = dash.Dash()
server = app.server
scaler=MinMaxScaler(feature_range=(0,1))
df_nse = pd.read_csv("./NSE-TATA.csv")
df_nse["Date"]=pd.to_datetime(df_nse.Date,format="%Y-%m-%d")
df_nse.index=df_nse['Date']
data=df_nse.sort_index(ascending=True,axis=0)
new_data=pd.DataFrame(index=range(0,len(df_nse)),columns=['Date','Close'])
for i in range(0,len(data)):
new_data["Date"][i]=data['Date'][i]
new_data["Close"][i]=data["Close"][i]
new_data.index=new_data.Date
new_data.drop("Date",axis=1,inplace=True)
dataset=new_data.values
train=dataset[0:987,:]
valid=dataset[987:,:]
scaler=MinMaxScaler(feature_range=(0,1))
scaled_data=scaler.fit_transform(dataset)
x_train,y_train=[],[]
for i in range(60,len(train)):
x_train.append(scaled_data[i-60:i,0])
y_train.append(scaled_data[i,0])
x_train,y_train=np.array(x_train),np.array(y_train)
x_train=np.reshape(x_train,(x_train.shape[0],x_train.shape[1],1))
model=load_model("saved_ltsm_model.h5")
inputs=new_data[len(new_data)-len(valid)-60:].values
inputs=inputs.reshape(-1,1)
inputs=scaler.transform(inputs)
X_test=[]
for i in range(60,inputs.shape[0]):
X_test.append(inputs[i-60:i,0])
X_test=np.array(X_test)
X_test=np.reshape(X_test,(X_test.shape[0],X_test.shape[1],1))
closing_price=model.predict(X_test)
closing_price=scaler.inverse_transform(closing_price)
train=new_data[:987]
valid=new_data[987:]
valid['Predictions']=closing_price
df= pd.read_csv("./stock_data.csv")
app.layout = html.Div([
html.H1("Stock Price Analysis Dashboard", style={"textAlign": "center"}),
dcc.Tabs(id="tabs", children=[
dcc.Tab(label='NSE-TATAGLOBAL Stock Data',children=[
html.Div([
html.H2("Actual closing price",style={"textAlign": "center"}),
dcc.Graph(
id="Actual Data",
figure={
"data":[
go.Scatter(
x=train.index,
y=valid["Close"],
mode='markers'
)
],
"layout":go.Layout(
title='scatter plot',
xaxis={'title':'Date'},
yaxis={'title':'Closing Rate'}
)
}
),
html.H2("LSTM Predicted closing price",style={"textAlign": "center"}),
dcc.Graph(
id="Predicted Data",
figure={
"data":[
go.Scatter(
x=valid.index,
y=valid["Predictions"],
mode='markers'
)
],
"layout":go.Layout(
title='scatter plot',
xaxis={'title':'Date'},
yaxis={'title':'Closing Rate'}
)
}
)
])
]),
dcc.Tab(label='Facebook Stock Data', children=[
html.Div([
html.H1("Facebook Stocks High vs Lows",
style={'textAlign': 'center'}),
dcc.Dropdown(id='my-dropdown',
options=[{'label': 'Tesla', 'value': 'TSLA'},
{'label': 'Apple','value': 'AAPL'},
{'label': 'Facebook', 'value': 'FB'},
{'label': 'Microsoft','value': 'MSFT'}],
multi=True,value=['FB'],
style={"display": "block", "margin-left": "auto",
"margin-right": "auto", "width": "60%"}),
dcc.Graph(id='highlow'),
html.H1("Facebook Market Volume", style={'textAlign': 'center'}),
dcc.Dropdown(id='my-dropdown2',
options=[{'label': 'Tesla', 'value': 'TSLA'},
{'label': 'Apple','value': 'AAPL'},
{'label': 'Facebook', 'value': 'FB'},
{'label': 'Microsoft','value': 'MSFT'}],
multi=True,value=['FB'],
style={"display": "block", "margin-left": "auto",
"margin-right": "auto", "width": "60%"}),
dcc.Graph(id='volume')
], className="container"),
])
])
])
#app.callback(Output('highlow', 'figure'),
[Input('my-dropdown', 'value')])
def update_graph(selected_dropdown):
dropdown = {"TSLA": "Tesla","AAPL": "Apple","FB": "Facebook","MSFT": "Microsoft",}
trace1 = []
trace2 = []
for stock in selected_dropdown:
trace1.append(
go.Scatter(x=df[df["Stock"] == stock]["Date"],
y=df[df["Stock"] == stock]["High"],
mode='lines', opacity=0.7,
name=f'High {dropdown[stock]}',textposition='bottom center'))
trace2.append(
go.Scatter(x=df[df["Stock"] == stock]["Date"],
y=df[df["Stock"] == stock]["Low"],
mode='lines', opacity=0.6,
name=f'Low {dropdown[stock]}',textposition='bottom center'))
traces = [trace1, trace2]
data = [val for sublist in traces for val in sublist]
figure = {'data': data,
'layout': go.Layout(colorway=["#5E0DAC", '#FF4F00', '#375CB1',
'#FF7400', '#FFF400', '#FF0056'],
height=600,
title=f"High and Low Prices for {', '.join(str(dropdown[i]) for i in selected_dropdown)} Over Time",
xaxis={"title":"Date",
'rangeselector': {'buttons': list([{'count': 1, 'label': '1M',
'step': 'month',
'stepmode': 'backward'},
{'count': 6, 'label': '6M',
'step': 'month',
'stepmode': 'backward'},
{'step': 'all'}])},
'rangeslider': {'visible': True}, 'type': 'date'},
yaxis={"title":"Price (USD)"})}
return figure
#app.callback(Output('volume', 'figure'),
[Input('my-dropdown2', 'value')])
def update_graph(selected_dropdown_value):
dropdown = {"TSLA": "Tesla","AAPL": "Apple","FB": "Facebook","MSFT": "Microsoft",}
trace1 = []
for stock in selected_dropdown_value:
trace1.append(
go.Scatter(x=df[df["Stock"] == stock]["Date"],
y=df[df["Stock"] == stock]["Volume"],
mode='lines', opacity=0.7,
name=f'Volume {dropdown[stock]}', textposition='bottom center'))
traces = [trace1]
data = [val for sublist in traces for val in sublist]
figure = {'data': data,
'layout': go.Layout(colorway=["#5E0DAC", '#FF4F00', '#375CB1',
'#FF7400', '#FFF400', '#FF0056'],
height=600,
title=f"Market Volume for {', '.join(str(dropdown[i]) for i in selected_dropdown_value)} Over Time",
xaxis={"title":"Date",
'rangeselector': {'buttons': list([{'count': 1, 'label': '1M',
'step': 'month',
'stepmode': 'backward'},
{'count': 6, 'label': '6M',
'step': 'month',
'stepmode': 'backward'},
{'step': 'all'}])},
'rangeslider': {'visible': True}, 'type': 'date'},
yaxis={"title":"Transactions Volume"})}
return figure
if __name__=='__main__':
app.run_server(debug=True)
I am trying to format numbers in a Julia Dash DataTable so that they are comma separated. However, I cannot get it to work.
(name = id_name, id = id_name, format = (locale = (grouping = [3])))
I have an example below in python where they use this. Could someone show me a working example?
Source: https://dash.plotly.com/datatable/typing
app.layout = html.Div([
dash_table.DataTable(
id='typing_formatting',
data=df_typing_formatting.to_dict('records'),
columns=[{
'id': 'city',
'name': 'City',
'type': 'text'
}, {
'id': 'max',
'name': u'Max Temperature (˚F)',
'type': 'numeric',
'format': Format(
precision=0,
scheme=Scheme.fixed,
symbol=Symbol.yes,
symbol_suffix=u'˚F'
),
# equivalent manual configuration
# 'format': {
# 'locale': {
# 'symbol': ['', '˚F']
# },
# 'specifier': '$.0f'
# }
}, {
'id': 'max_date',
'name': 'Max Temperature (Date)',
'type': 'datetime'
}, {
'id': 'min',
'name': u'Min Temperature (˚F)',
'type': 'numeric',
'format': Format(
nully='N/A',
precision=0,
scheme=Scheme.fixed,
sign=Sign.parantheses,
symbol=Symbol.yes,
symbol_suffix=u'˚F'
),
# equivalent manual configuration
# 'format': {
# 'locale': {
# 'symbol': ['', '˚F']
# },
# 'nully': 'N/A'
# 'specifier': '($.0f'
# }
'on_change': {
'action': 'coerce',
'failure': 'default'
},
'validation': {
'default': None
}
}, {
'id': 'min_date',
'name': 'Min Temperature (Date)',
'type': 'datetime',
'on_change': {
'action': 'none'
}
}]
Is there a smart pythonic way to parse a nested column in a pandas dataframe like this one to 3 different columns? So for example the column could look like this:
col1
[{'name': 'amount', 'value': 1}, {'name': 'frequency', 'value': 2}, {'name': 'freq_unit', 'value': 'month'}]
[{'name': 'amount', 'value': 3}, {'name': 'frequency', 'value': 1}, {'name': 'freq_unit', 'value': 'month'}]
And the expected result should be these 3 columns:
amount frequency freq_unit
1 2 month
3 1 month
That's just level 1. I have the level 2: What if the elements in the list still have the same names (amount, frequency and freq_unit) but the order could change? Could the code in the answer deal with this?
col1
[{'name': 'amount', 'value': 1}, {'name': 'frequency', 'value': 2}, {'name': 'freq_unit', 'value': 'month'}]
[{'name': 'amount', 'value': 3}, {'name': 'freq_unit', 'value': 'month'}, {'name': 'frequency', 'value': 1}]
Code for reproduce the data. Really look forward to see how the community would solve this. Thank you
data = {'col1':[[{'name': 'amount', 'value': 1}, {'name': 'frequency', 'value': 2}, {'name': 'freq_unit', 'value': 'month'}],
[{'name': 'amount', 'value': 3}, {'name': 'frequency', 'value': 1}, {'name': 'freq_unit', 'value': 'month'}]]}
df = pd.DataFrame(data)
A combination of list comprehension, itertools.chain, and collections.defaultdict could help out here:
from itertools import chain
from collections import defaultdict
data = defaultdict(list)
phase1 = [[(data["name"], data["value"])
for data in entry]
for entry in df.col1
]
phase1 = chain.from_iterable(phase1)
for key, value in phase1:
data[key].append(value)
pd.DataFrame(data)
amount frequency freq_unit
0 1 2 month
1 3 1 month
The above is verbose: #piRSquared's comment is much simpler, with a list comprehension:
pd.DataFrame([{x["name"]: x["value"] for x in lst} for lst in df.col1])
Another idea, but very unnecessary, is to use a list comprehension, combined with Pandas' string methods:
outcome = [(df.col1.str[num].str["value"]
.rename(df.col1.str[num].str["name"][0])
)
for num in range(df.col1.str.len()[0])
]
pd.concat(outcome, axis = 'columns')
#piRsquared's solution is the simplest, in my opinion.
You can write a function that will parse each cell in your Series and return a properly formatted Series and use apply to tuck the iteration away:
>>> def custom_parser(record):
... clean_record = {rec["name"]: rec["value"] for rec in record}
... return pd.Series(clean_record)
>>> df["col1"].apply(custom_parser)
amount frequency freq_unit
0 1 2 month
1 3 1 month
I have a Datafrmae with output as shown below, I am trying to extract specific text
id,value
101,*sample value as shown below*
I am trying to extract the value corresponding to key in this text
Expected output
id, key, id_new
101,Ticket-123, 1001
Given below is how the data looks like:
{
'fields': {
'status': {
'statusCategory': {
'colorName': 'yellow',
'name': 'In Progress',
'key': 'indeterminate',
'id': 4
},
'description': '',
'id': '11000',
'name': 'In Progress'
},
'summary': 'Sample Text'
},
'key': 'Ticket-123',
'id': '1001'
}
Use Series.str.get:
df['key'] = df['value'].str.get('key')
df['id_new'] = df['value'].str.get('id')
print (df)
id value key id_new
0 101 {'fields': {'status': {'statusCategory': {'col... Ticket-123 1001
Tested Dataframe:
v = {
'fields': {
'status': {
'statusCategory': {
'colorName': 'yellow',
'name': 'In Progress',
'key': 'indeterminate',
'id': 4
},
'description': '',
'id': '11000',
'name': 'In Progress'
},
'summary': 'Sample Text'
},
'key': 'Ticket-123',
'id': '1001'
}
df = pd.DataFrame({'id':101, 'value':[v]})
Hello I am trying to insert one row into a table, I succesfully created the table as follows:
schema = [{'name': 'foo', 'type': 'STRING', 'mode': 'nullable'},{'name': 'bar', 'type': 'FLOAT', 'mode': 'nullable'}]
created = client.create_table(dataset='api_data_set_course_33', table='insert_test_333', schema=schema)
print('Creation Result ',created)
However when I push the row I got False,
rows = [{'id': 'NzAzYmRiY', 'one': 'uno', 'two': 'dos'}]
inserted = client.push_rows('api_data_set_course_33','insert_test_333', rows, 'id')
print('Insertion Result ',inserted)
So I don't have idea what is wrong, I really would like to appreciate support to overcome this task
This is the API that I am testing:
https://github.com/tylertreat/BigQuery-Python
This is my complete code:
schema = [{'name': 'foo', 'type': 'STRING', 'mode': 'nullable'},{'name': 'bar', 'type': 'FLOAT', 'mode': 'nullable'}]
created = client.create_table(dataset='api_data_set_course_33', table='insert_test_333', schema=schema)
print('Creation Result ',created)
rows = [{'id': 'NzAzYmRiY', 'one': 'uno', 'two': 'dos'}]
inserted = client.push_rows('api_data_set_course_33','insert_test_333', rows, 'id')
print('Insertion Result ',inserted)
Output:
Creation Result True
Insertion Result False
After feedback I tried:
>>> client = get_client(project_id, service_account=service_account,private_key_file=key, readonly=False)
>>> schema = [{'name': 'foo', 'type': 'STRING', 'mode': 'nullable'},{'name': 'bar', 'type': 'FLOAT', 'mode': 'nullable'}]
>>> rows = [{'id': 'NzAzYmRiY', 'foo': 'uno', 'bar': 'dos'}]
>>> inserted = client.push_rows('api_data_set_course_33','insert_test_333', rows, 'id')
>>> print(inserted)
False
and also:
>>> rows = [{'id': 'NzAzYmRiY', 'foo': 'uno', 'bar': 45}]
>>> inserted = client.push_rows('api_data_set_course_33','insert_test_333', rows, 'id')
>>> print(inserted)
False
However I only got false
Your row field names don't match your schema field names. Try this instead:
rows = [{'id': 'NzAzYmRiY', 'foo': 'uno', 'bar': 'dos'}]