Error in loading nested and repeated data to BigQuery - pandas

I am getting a JSON response from an API.
I want to take 5 columns from it: 4 are plain columns, but 1 is of RECORD REPEATED type.
I want to load that data into a BigQuery table.
Below is my code, in which the schema is specified.
import requests
from requests.auth import HTTPBasicAuth
import json
from google.cloud import bigquery
import pandas
import pandas_gbq
URL='<API>'
auth = HTTPBasicAuth('username', 'password')
# sending get request and saving the response as response object
r = requests.get(url=URL, auth=auth)
data = r.json()
---------------------- JSON response ----------------
{
    "data": {
        "id": "jfp695q8",
        "origin": "taste",
        "title": "Christmas pudding martini recipe",
        "subtitle": null,
        "customTitles": [{
            "name": "editorial",
            "value": "Christmas pudding martini"
        }]
    }
}
id=data['data']['id']
origin=data['data']['origin']
title=data['data']['title']
subtitle=data['data']['subtitle']
customTitles=json.dumps(data['data']['customTitles'])
# print(customTitles)
df = pandas.DataFrame(
    {
        'id': id,
        'origin': origin,
        'title': title,
        'subtitle': 'subtitle',
        'customTitles': customTitles
    }, index=[0]
)
# df.head()
client = bigquery.Client(project='ncau-data-newsquery-sit')
table_id = 'sdm_adpoint.testfapi'
job_config = bigquery.LoadJobConfig(
    schema=[
        bigquery.SchemaField("id", "STRING"),
        bigquery.SchemaField("origin", "STRING"),
        bigquery.SchemaField("title", "STRING"),
        bigquery.SchemaField("subtitle", "STRING"),
        bigquery.SchemaField(
            "customTitles",
            "RECORD",
            mode="REPEATED",
            fields=[
                bigquery.SchemaField("name", "STRING", mode="NULLABLE"),
                bigquery.SchemaField("value", "STRING", mode="NULLABLE"),
            ],
        ),
    ],
    autodetect=False
)
df.head()
job = client.load_table_from_dataframe(
df, table_id, job_config=job_config
)
job.result()
customTitles is a RECORD REPEATED field with two keys, name and value, so I have defined the schema accordingly.
Below is my table schema.
Below is the output of df.head():
jfp695q8 taste Christmas pudding martini recipe subtitle [{"name": "editorial", "value": "Christmas pudding martini"}]
Up to here everything is fine.
But when I try to load the data into the table, it throws the error below.
ArrowTypeError: Could not convert '[' with type str: was expecting tuple of (key, value) pair
Can anyone tell me what's wrong here?
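(Not an answer from the thread, but a hedged sketch of one likely fix: load_table_from_dataframe converts the DataFrame through pyarrow, and the ArrowTypeError suggests the REPEATED RECORD column is arriving as a JSON string. Keeping customTitles as an actual Python list of dicts lets pyarrow map it to the nested schema; whether this works can also depend on the google-cloud-bigquery and pyarrow versions in use.)
# Keep the nested value as Python objects instead of a JSON string,
# so pyarrow can convert it to the RECORD REPEATED column.
customTitles = data['data']['customTitles']  # e.g. [{"name": "editorial", "value": "..."}]
df = pandas.DataFrame(
    {
        'id': id,
        'origin': origin,
        'title': title,
        'subtitle': subtitle,
        'customTitles': [customTitles]  # single row whose cell is the list of dicts
    }
)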

Related

Json loading error using Apache beam on Vertex AI

I am trying to load data from Datastore to BigQuery using Apache Beam in a Vertex AI notebook. This is the part of the code where the loading happens:
from apache_beam.io.gcp.datastore.v1new.datastoreio import ReadFromDatastore
import apache_beam as beam
from apache_beam.io.gcp.bigquery_file_loads import BigQueryBatchFileLoads
table_row = (p
             | 'DatastoreGetData' >> ReadFromDatastore(query=myquery)
             | 'EntityConversion' >> beam.Map(ent_to_json_func, table_schema)
             | 'Final' >> BigQueryBatchFileLoads(
                 destination=lambda row: f"myproject:dataset.mytable",
                 custom_gcs_temp_location=f'gs://myproject/beam',
                 write_disposition='WRITE_TRUNCATE',
                 schema=table_schema
             )
             )
table_schema is the JSON version of the BigQuery table schema (column mapping picture attached below).
ent_to_json_func converts each field coming from Datastore to the corresponding BigQuery field in the correct format.
I am trying to load just one row from Datastore, and it is giving an error. The data looks like this:
{ "key": { "namespace": null, "app": null, "path":
"Table/12345678", "kind": "Mykind", "name": null, "id":
12345678 }, "col1": false, "col2": { "namespace": null,
"app": null, "path": "abc/12345", "kind": "abc", "name":
null, "id": 12345 }, "col3": "6835218432", "col4": {
"namespace": null, "app": null, "path": null, "kind":
null, "name": null, "id": null }, "col5": false,
"col6": null, "col7": "https://www.somewebsite.com/poi/",
"col8": "0.00", "col9": "2022-03-12 03:44:17.732193+00:00",
"col10":
"{"someid":"NAME","col7":"https://www.somewebsite.com/poi/", "provided":"Yes","someid2":"SDFTYI1090"}",
"col11": ""0.00"", "col12": "{}", "col13": [] }
The column mapping is here
The error is as follows:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
~/apache-beam-2.41.0/packages/beam/sdks/python/apache_beam/runners/common.py in process(self, windowed_value)
1416 try:
-> 1417 return self.do_fn_invoker.invoke_process(windowed_value)
1418 except BaseException as exn:
~/apache-beam-2.41.0/packages/beam/sdks/python/apache_beam/runners/common.py in invoke_process(self, windowed_value, restriction, watermark_estimator_state, additional_args, additional_kwargs)
837 self._invoke_process_per_window(
--> 838 windowed_value, additional_args, additional_kwargs)
839 return residuals
~/apache-beam-2.41.0/packages/beam/sdks/python/apache_beam/runners/common.py in _invoke_process_per_window(self, windowed_value, additional_args, additional_kwargs)
982 windowed_value,
--> 983 self.process_method(*args_for_process, **kwargs_for_process),
984 self.threadsafe_watermark_estimator)
~/apache-beam-2.41.0/packages/beam/sdks/python/apache_beam/io/gcp/bigquery_file_loads.py in process(self, element, dest_ids_list)
753 # max_retries to 0.
--> 754 self.bq_wrapper.wait_for_bq_job(ref, sleep_duration_sec=10, max_retries=0)
755
~/apache-beam-2.41.0/packages/beam/sdks/python/apache_beam/io/gcp/bigquery_tools.py in wait_for_bq_job(self, job_reference, sleep_duration_sec, max_retries)
637 'BigQuery job {} failed. Error Result: {}'.format(
--> 638 job_reference.jobId, job.status.errorResult))
639 elif job.status.state == 'DONE':
RuntimeError: BigQuery job beam_bq_job_LOAD_AUTOMATIC_JOB_NAME_LOAD_STEP_187_4cab298bbd73af86496c64ca35602a05_a5309204fb004ae0ba8007ac2169e079 failed.
Error Result: <ErrorProto
location: 'gs://myproject/beam/bq_load/63e94c1a210742aabab09f96/myproject.dataset.mytable/aed960fb-0ae6-489a-9cb8-e012eee0d9c8'
message: 'Error while reading data, error message: JSON table encountered too many errors, giving up. Rows: 1; errors: 1. Please look into the errors[] collection for more details.
File: gs://myproject/beam/bq_load/63e94c1a210742aabab09f96/myproject.dataset.mytable/aed960fb-0ae6-489a-9cb8-e012eee0d9c8'
reason: 'invalid'>
Please let me know what to do. How can I determine the exact cause of the error? I have also validated this JSON with a JSON validator, and it reported no issues.
UPDATE:
I found out that the issue is due to a BYTES column. The value comes from Datastore as bytes, which I convert to a string using decode and save in the JSON. When I upload that JSON into BigQuery, it gives an error.
How should I proceed in this case?
To solve your issue you have to set the STRING type for col10, col11, col12 and col13 in the BigQuery table.
The final dict in your PCollection needs to match the schema of the BigQuery table exactly.
In your JSON these columns appear as strings, so your schema also needs the STRING type for these columns.
The error message is accurate: the file being loaded into BigQuery is not valid JSON. See "{"someid":"NAME","col7":"https://www.somewebsite.com/poi/", where there are quotes without escapes.
Loading into a BYTES type should actually be fine.
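As an illustration of the escaping point (a sketch of mine, not code from the question's ent_to_json_func): serialize any nested dict or list with json.dumps before putting it in the row, and declare those columns (col10 to col13 here) as STRING in the table schema.
import json

def to_bq_row(entity_dict):
    # Hypothetical helper: build the flat dict that goes to BigQuery.
    row = {}
    for key, value in entity_dict.items():
        if isinstance(value, (dict, list)):
            # json.dumps yields a properly escaped JSON string instead of raw nested quotes.
            row[key] = json.dumps(value)
        elif isinstance(value, bytes):
            # Bytes from Datastore can be decoded (or base64-encoded) to text first.
            row[key] = value.decode('utf-8')
        else:
            row[key] = value
    return row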

Is there a way to match avro schema with Bigquery and Bigtable?

I'd like to import BigQuery data into Bigtable using Google Composer.
Exporting BigQuery rows in Avro format to GCS was successful; however, importing the Avro data into Bigtable was not.
The error says
Caused by: org.apache.avro.AvroTypeException: Found Root, expecting com.google.cloud.teleport.bigtable.BigtableRow, missing required field key
I guess the schemas of BigQuery and Bigtable should match each other, but I have no idea how to do this.
For every record read from the Avro files:
Attributes present in the files and in the table are loaded into the table.
Attributes present in the file but not in the table are subject to ignore_unknown_fields.
Attributes that exist in the table but not in the file will use their default value, if there is one set.
The below links are helpful.
[1] https://cloud.google.com/dataflow/docs/guides/templates/provided-batch#cloud-storage-avro-to-bigtable
[2] https://github.com/GoogleCloudPlatform/DataflowTemplates/blob/master/src/main/resources/schema/avro/bigtable.avsc
[3] Avro to BigTable - Schema issue?
For those of you who, like me, still have this problem because you are not familiar with Avro, here is one working schema transformation that I found after some tinkering.
For example, say you have a table from BigQuery like this, and you want to use user_id as the Bigtable row key and ingest all columns. Here is example code to encode them as an Avro file.
import json
from datetime import datetime

from avro.schema import Parse
from avro.io import DatumWriter
from avro.datafile import DataFileWriter

# Avro schema expected by the Dataflow Avro-to-Bigtable template (see [2] above).
bigtable_schema = {
    "name": "BigtableRow",
    "type": "record",
    "namespace": "com.google.cloud.teleport.bigtable",
    "fields": [
        {"name": "key", "type": "bytes"},
        {"name": "cells",
         "type": {
             "type": "array",
             "items": {
                 "name": "BigtableCell",
                 "type": "record",
                 "fields": [
                     {"name": "family", "type": "string"},
                     {"name": "qualifier", "type": "bytes"},
                     {"name": "timestamp", "type": "long", "logicalType": "timestamp-micros"},
                     {"name": "value", "type": "bytes"}
                 ]
             }
         }}
    ]
}
parsed_schema = Parse(json.dumps(bigtable_schema))

# df holds the BigQuery rows to export (see the note after this block).
row_key = 'user_id'
family_name = 'feature_name'
feature_list = ['channel', 'zip_code', 'history']

with open('features.avro', 'wb') as f:
    writer = DataFileWriter(f, DatumWriter(), parsed_schema)
    for item in df.iterrows():
        row = item[1]
        ts = int(datetime.now().timestamp()) * 1000 * 1000  # timestamp in microseconds
        for feat in feature_list:
            writer.append({
                "key": row[row_key].encode('utf-8'),
                "cells": [{"family": family_name,
                           "qualifier": feat.encode('utf-8'),
                           "timestamp": ts,
                           "value": str(row[feat]).encode('utf-8')}]
            })
    writer.close()
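The df above is assumed to already hold the BigQuery rows; one hedged way to obtain it with the google-cloud-bigquery client (the project, dataset and table names below are placeholders):
from google.cloud import bigquery

bq = bigquery.Client(project='my-project')
# Pull only the columns used above into a pandas DataFrame.
df = bq.query(
    'SELECT user_id, channel, zip_code, history FROM `my-project.my_dataset.features`'
).to_dataframe()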
Then you can use a Dataflow template job to run the ingestion.
Complete code can be found here: https://github.com/mitbal/sidu/blob/master/bigquery_to_bigtable.ipynb

How to load a jsonl file into BigQuery when the file has mixed data fields as columns

In my workflow, after extracting the data from the API, the JSON has the following structure:
[
    {
        "fields": [
            {
                "meta": {
                    "app_type": "ios"
                },
                "name": "app_id",
                "value": 100
            },
            {
                "meta": {},
                "name": "country",
                "value": "AE"
            },
            {
                "meta": {
                    "name": "Top"
                },
                "name": "position",
                "value": 1
            }
        ],
        "metrics": {
            "click": 1,
            "price": 1,
            "count": 1
        }
    }
]
Then it is stored as .jsonl and put on GCS. However, when I load it into BigQuery for further extraction, the automatic schema inference returns the following error:
Error while reading data, error message: JSON parsing error in row starting at position 0: Could not convert value to string. Field: value; Value: 100
I want to convert it into the following structure:
app_type | app_id | country | position | click | price | count
ios      | 100    | AE      | Top      | 1     | 1     | 1
Is there a way to define a manual schema in BigQuery to achieve this result? Or do I have to preprocess the jsonl file before putting it into BigQuery?
One of the limitations when loading JSON data from GCS to BigQuery is that it does not support maps or dictionaries in JSON.
An invalid example would be:
"metrics": {
"click": 1,
"price": 1,
"count": 1
}
Your jsonl file should be something like this:
{"app_type":"ios","app_id":"100","country":"AE","position":"Top","click":"1","price":"1","count":"1"}
I already tested it and it works fine.
So wherever you handle the conversion of the JSON files to jsonl files and their storage to GCS, you will have to do some preprocessing (see the sketch after the options below).
You probably have two options:
precreate the target table with an app_id field of type INTEGER
preprocess the json file and enclose 100 in quotes like "100"
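For the preprocessing route, here is a minimal sketch (mine, not from the answer) that flattens each record into one JSON object per line. The input/output file names are placeholders, and the assumption that app_type and position come from the fields' meta blocks is taken from the expected output shown above.
import json

def flatten_record(record):
    # Index the "fields" list by field name for easy lookup.
    fields = {f["name"]: f for f in record["fields"]}
    flat = {
        "app_type": fields["app_id"]["meta"].get("app_type"),
        "app_id": str(fields["app_id"]["value"]),
        "country": fields["country"]["value"],
        "position": fields["position"]["meta"].get("name"),
    }
    # The metrics keys (click, price, count) become plain columns.
    flat.update({k: str(v) for k, v in record["metrics"].items()})
    return flat

with open("input.json") as src, open("output.jsonl", "w") as dst:
    for record in json.load(src):  # the API response is a JSON array of records
        dst.write(json.dumps(flatten_record(record)) + "\n")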

How to export pandas data to elasticsearch?

It is possible to export pandas DataFrame data to Elasticsearch using elasticsearch-py. For example, here is some code:
https://www.analyticsvidhya.com/blog/2017/05/beginners-guide-to-data-exploration-using-elastic-search-and-kibana/
There are a lot of similar methods like to_excel, to_csv, to_sql.
Is there a to_elastic method? If not, where should I request it?
The following script works for localhost:
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randint(0, 100, size=(100, 4)), columns=list('ABCD'))

INDEX = "dataframe"
TYPE = "record"

def rec_to_actions(df):
    import json
    for record in df.to_dict(orient="records"):
        yield ('{ "index" : { "_index" : "%s", "_type" : "%s" }}' % (INDEX, TYPE))
        yield (json.dumps(record, default=int))

from elasticsearch import Elasticsearch

e = Elasticsearch()  # no args, connect to localhost:9200
if not e.indices.exists(INDEX):
    raise RuntimeError('index does not exist, use `curl -X PUT "localhost:9200/%s"` and try again' % INDEX)

r = e.bulk(rec_to_actions(df))  # returns a dict
print(not r["errors"])
Verify using curl -g 'http://localhost:9200/dataframe/_search?q=A:[29%20TO%2039]'
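For reference, an equivalent check from Python using the client created above (the same range query on field A):
# Query-string search, mirroring the curl verification above.
hits = e.search(index=INDEX, q='A:[29 TO 39]')
print(hits['hits']['total'])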
There are many little things that can be added to suit different needs, but the main part is there.
I'm not aware of any to_elastic method integrated into pandas. You can always raise an issue on the pandas GitHub repo or create a pull request.
However, there is espandas, which allows importing a pandas DataFrame into Elasticsearch. The following example from the README has been tested with Elasticsearch 6.2.1.
import pandas as pd
import numpy as np
from espandas import Espandas
df = (100 * pd.DataFrame(np.round(np.random.rand(100, 5), 2))).astype(int)
df.columns = ['A', 'B', 'C', 'D', 'E']
df['indexId'] = (df.index + 100).astype(str)
INDEX = 'foo_index'
TYPE = 'bar_type'
esp = Espandas()
esp.es_write(df, INDEX, TYPE)
Retrieving the mappings with GET foo_index/_mappings:
{
    "foo_index": {
        "mappings": {
            "bar_type": {
                "properties": {
                    "A": {"type": "long"},
                    "B": {"type": "long"},
                    "C": {"type": "long"},
                    "D": {"type": "long"},
                    "E": {"type": "long"},
                    "indexId": {
                        "type": "text",
                        "fields": {
                            "keyword": {
                                "type": "keyword",
                                "ignore_above": 256
                            }
                        }
                    }
                }
            }
        }
    }
}
Maybe you can use es_pandas:
pip install es_pandas
pip install progressbar2
This package should work on Python 3 (>= 3.4), and Elasticsearch should be version 5.x, 6.x or 7.x.
import time
import pandas as pd
from es_pandas import es_pandas
# Information of the es cluster
es_host = 'localhost:9200'
index = 'demo'
# create es_pandas instance
ep = es_pandas(es_host)
# Example data frame
df = pd.DataFrame({'Alpha': [chr(i) for i in range(97, 128)],
'Num': [x for x in range(31)],
'Date': pd.date_range(start='2019/01/01', end='2019/01/31')})
# init template if you want
doc_type = 'demo'
ep.init_es_tmpl(df, doc_type)
# Example of write data to es, use the template you create
ep.to_es(df, index, doc_type=doc_type)
# set use_index=True if you want to use DataFrame index as records' _id
ep.to_es(df, index, doc_type=doc_type, use_index=True)
Here is the documentation: https://pypi.org/project/es-pandas/
If es_pandas can't solve your problem, you could look at another solution: https://towardsdatascience.com/exporting-pandas-data-to-elasticsearch-724aa4dd8f62
You could use elasticsearch-py, or if you won't use elasticsearch-py you may find the answer to your question here => index-a-pandas-dataframe-into-elasticsearch-without-elasticsearch-py
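As a small sketch of the elasticsearch-py route mentioned above (the host and index names are placeholders, and the exact client API differs between major Elasticsearch versions):
import pandas as pd
from elasticsearch import Elasticsearch, helpers

df = pd.DataFrame({'A': [1, 2], 'B': ['x', 'y']})
es = Elasticsearch('http://localhost:9200')  # placeholder host

# One bulk action per DataFrame row; the index name is a placeholder.
actions = (
    {'_index': 'dataframe', '_source': record}
    for record in df.to_dict(orient='records')
)
helpers.bulk(es, actions)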

Nested pymongo queries (mlab)

I have some documents in mlab mongodb; the format is:
{
    "_id": {
        "$oid": "58aeb1d074fece33edf2b356"
    },
    "sensordata": {
        "operation": "chgstatus",
        "user": {
            "status": "0",
            "uniqueid": "191b117fcf5c"
        }
    },
    "created_date": {
        "$date": "2017-02-23T15:26:29.840Z"
    }
}
database name : mparking_sensor
collection name : sensor
I want to query in Python to extract only the status key-value pair and the created_date key-value pair.
My Python code is:
import sys
import pymongo

uri = 'mongodb://thorburn:tekush1!#ds157529.mlab.com:57529/mparking_sensor'
client = pymongo.MongoClient(uri)
db = client.get_default_database().sensor
print(db)
results = db.find()
for record in results:
    print(record["sensordata"], record['created_date'])
    print()
client.close()
This gives me everything under sensordata as expected, but dot notation gives me an error. Can somebody help?
PyMongo represents BSON documents as Python dictionaries, and subdocuments as dictionaries within dictionaries. To access a value in a nested dictionary:
record["sensordata"]["user"]["status"]
So a complete print statement might be:
print("%s %s" % (record["sensordata"]["user"]["status"], record['created_date']))
That prints:
0 {'$date': '2017-02-23T15:26:29.840Z'}
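A small additional sketch (mine, not from the answer): you can also narrow the query itself with a projection, so only the nested status and created_date come back from the server.
# The second argument to find() is a projection: 1 includes a field, 0 excludes it.
results = db.find(
    {},
    {"sensordata.user.status": 1, "created_date": 1, "_id": 0}
)
for record in results:
    print(record["sensordata"]["user"]["status"], record["created_date"])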