Create table from Google Sheet with multi line headings - google-bigquery

I have been creating BigQuery tables from Google Sheets via the BQ console GUI. This has been working well, but I am now trying to import a file with multiple heading lines and I don't know how to go about it.
If I try to auto detect schema and skip the first 2 rows I get the error Failed to create table: Duplicate column names: 'Date'.
If I try to manually create the schema, similar to below, then the import works, but when I go to query the records I get the error Unsupported field type: RECORD
[
{
"name": "SubjectName",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "SubjectCode",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "Class1",
"type": "RECORD",
"mode": "NULLABLE",
"fields": [
{
"name": "Date",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "Tutor",
"type": "STRING",
"mode": "NULLABLE"
}
]
},
{
"name": "Class2",
"type": "RECORD",
"mode": "NULLABLE",
"fields": [
{
"name": "Date",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "Tutor",
"type": "STRING",
"mode": "NULLABLE"
}
]
},
{
"name": "Class3",
"type": "RECORD",
"mode": "NULLABLE",
"fields": [
{
"name": "Date",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "Tutor",
"type": "STRING",
"mode": "NULLABLE"
}
]
}
]
Is there any way to import a google sheet with multiple headings via the BQ Console gui?

Related

How can I update records based on conditions on nested fields?

I have a bigquery table with the following schema:
{
"fields": [
{
"name": "products",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "name",
"type": "STRING",
"mode": "REQUIRED"
},
{
"name": "qty",
"type": "INTEGER",
"mode": "REQUIRED"
},
{
"name": "variant_name",
"type": "STRING",
"mode": "REQUIRED"
},
{
"name": "order_id",
"type": "INTEGER",
"mode": "REQUIRED"
},
{
"name": "test_mode",
"type": "BOOLEAN",
"mode": "REQUIRED"
},
{
"name": "amount",
"type": "FLOAT",
"mode": "REQUIRED"
},
{
"name": "transaction_reference",
"type": "STRING",
"mode": "REQUIRED"
},
{
"name": "id",
"type": "INTEGER",
"mode": "REQUIRED"
},
{
"name": "source",
"type": "STRING",
"mode": "REQUIRED"
},
{
"name": "currency",
"type": "STRING",
"mode": "REQUIRED"
}
]
},
{
"name": "processed_at",
"type": "TIMESTAMP",
"mode": "REQUIRED",
"description": "bq-datetime"
},
{
"name": "inserted_at",
"type": "TIMESTAMP",
"mode": "REQUIRED",
"description": "bq-datetime"
}
]
}
As you can see the products field is a nested one. What I would like to achieve is an update condition like the following:
UPDATE `dataset.table`
SET processed_at = '2021-04-17T16:07:30.993806'
WHERE processed_at = '1970-01-01T00:00:00' AND products.order_id = 9366054;
And whenever I try to do so, I get the following error
Cannot access field order_id on a value with type ARRAY<STRUCT<name STRING, qty INT64, variant_name STRING, ...>>
I know that to SELECT stuff with the same logic I can use the UNNEST statement, but I am not able to apply this to UPDATE.
Try EXISTS:
WHERE processed_at = '1970-01-01T00:00:00'
and EXISTS (SELECT *
FROM UNNEST(products)
WHERE order_id = 9366054
);

Export BigQuery table schema to JSON Schema

It is possible to export a bigquery table schema to a JSON file but the resulting JSON file is a bigquery table schema and not a JSON schema.
I am looking for a way to generate a JSON schema using a bigquery table based on the standard available here: https://json-schema.org/
This looks something like this:
{
"definitions": {},
"$schema": "http://json-schema.org/draft-07/schema#",
"$id": "http://example.com/root.json",
"type": "object",
"title": "The Root Schema",
"required": [
"glossary"
],
"properties": {
"glossary": {
"$id": "#/properties/glossary",
"type": "object",
"title": "The Glossary Schema",
"required": [
"title",
"GlossDiv"
],
"properties": {
"title": {
"$id": "#/properties/glossary/properties/title",
"type": "string",
"title": "The Title Schema",
"default": "",
"examples": [
"example glossary"
],
"pattern": "^(.*)$"
},
"GlossDiv": {
"$id": "#/properties/glossary/properties/GlossDiv",
"type": "object",
"title": "The Glossdiv Schema",
"required": [
"title",
"GlossList"
],
"properties": {
"title": {
"$id": "#/properties/glossary/properties/GlossDiv/properties/title",
"type": "string",
"title": "The Title Schema",
"default": "",
"examples": [
"S"
],
"pattern": "^(.*)$"
},
"GlossList": {
"$id": "#/properties/glossary/properties/GlossDiv/properties/GlossList",
"type": "object",
"title": "The Glosslist Schema",
"required": [
"GlossEntry"
],
"properties": {
"GlossEntry": {
"$id": "#/properties/glossary/properties/GlossDiv/properties/GlossList/properties/GlossEntry",
"type": "object",
"title": "The Glossentry Schema",
"required": [
"ID",
"SortAs",
"GlossTerm",
"Acronym",
"Abbrev",
"GlossDef",
"GlossSee"
],
"properties": {
"ID": {
"$id": "#/properties/glossary/properties/GlossDiv/properties/GlossList/properties/GlossEntry/properties/ID",
"type": "string",
"title": "The Id Schema",
"default": "",
"examples": [
"SGML"
],
"pattern": "^(.*)$"
},
"SortAs": {
"$id": "#/properties/glossary/properties/GlossDiv/properties/GlossList/properties/GlossEntry/properties/SortAs",
"type": "string",
"title": "The Sortas Schema",
"default": "",
"examples": [
"SGML"
],
"pattern": "^(.*)$"
},
"GlossTerm": {
"$id": "#/properties/glossary/properties/GlossDiv/properties/GlossList/properties/GlossEntry/properties/GlossTerm",
"type": "string",
"title": "The Glossterm Schema",
"default": "",
"examples": [
"Standard Generalized Markup Language"
],
"pattern": "^(.*)$"
},
"Acronym": {
"$id": "#/properties/glossary/properties/GlossDiv/properties/GlossList/properties/GlossEntry/properties/Acronym",
"type": "string",
"title": "The Acronym Schema",
"default": "",
"examples": [
"SGML"
],
"pattern": "^(.*)$"
},
"Abbrev": {
"$id": "#/properties/glossary/properties/GlossDiv/properties/GlossList/properties/GlossEntry/properties/Abbrev",
"type": "string",
"title": "The Abbrev Schema",
"default": "",
"examples": [
"ISO 8879:1986"
],
"pattern": "^(.*)$"
},
"GlossDef": {
"$id": "#/properties/glossary/properties/GlossDiv/properties/GlossList/properties/GlossEntry/properties/GlossDef",
"type": "object",
"title": "The Glossdef Schema",
"required": [
"para",
"GlossSeeAlso"
],
"properties": {
"para": {
"$id": "#/properties/glossary/properties/GlossDiv/properties/GlossList/properties/GlossEntry/properties/GlossDef/properties/para",
"type": "string",
"title": "The Para Schema",
"default": "",
"examples": [
"A meta-markup language, used to create markup languages such as DocBook."
],
"pattern": "^(.*)$"
},
"GlossSeeAlso": {
"$id": "#/properties/glossary/properties/GlossDiv/properties/GlossList/properties/GlossEntry/properties/GlossDef/properties/GlossSeeAlso",
"type": "array",
"title": "The Glossseealso Schema",
"items": {
"$id": "#/properties/glossary/properties/GlossDiv/properties/GlossList/properties/GlossEntry/properties/GlossDef/properties/GlossSeeAlso/items",
"type": "string",
"title": "The Items Schema",
"default": "",
"examples": [
"GML",
"XML"
],
"pattern": "^(.*)$"
}
}
}
},
"GlossSee": {
"$id": "#/properties/glossary/properties/GlossDiv/properties/GlossList/properties/GlossEntry/properties/GlossSee",
"type": "string",
"title": "The Glosssee Schema",
"default": "",
"examples": [
"markup"
],
"pattern": "^(.*)$"
}
}
}
}
}
}
}
}
}
}
}
BigQuery does not use the json-schema standard for the tables schema. I found two projects that have the code available to go from json-schema to BigQuery schema:
jsonschema-bigquery
jsonschema-transpiler
You could try using those projects as reference to create the opposite transformation. Also, you could create a feature request to the BigQuery team, asking to include the json-schema standard as an output format option.
No this is not possible without writing a program to do so for you.
There is a feature request made by me that requests this functionality.
https://issuetracker.google.com/issues/145308573

hive can't create table with nested avro schema

I'm trying to use a nested avro schema to create a hive table. But it does not work. I'm using hive 1.1 in cdh5.7.2.
here is my nested avro schema:
[
{
"type": "record",
"name": "Id",
"namespace": "com.test.app_list",
"doc": "Device ID",
"fields": [
{
"name": "idType",
"type": "int"
},{
"name": "id",
"type": "string"
}
]
},
{
"type": "record",
"name": "AppList",
"namespace": "com.test.app_list",
"doc": "",
"fields": [
{
"name": "appId",
"type": "string",
"avro.java.string": "String"
},
{
"name": "timestamp",
"type": "long"
},
{
"name": "idList",
"type": [{"type": "array", "items": "com.test.app_list.Id"}]
}
]
}
]
And my sql to create table:
CREATE EXTERNAL TABLE app_list
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
TBLPROPERTIES (
'avro.schema.url'='/hive/schema/test_app_list.avsc');
But hive gives me:
FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.DDLTask. java.lang.RuntimeException: MetaException(message:org.apache.hadoop.hive.serde2.avro.AvroSerdeException Schema for table must be of type RECORD. Received type: UNION)
hive doc shows: Supports arbitrarily nested schemas. from : https://cwiki.apache.org/confluence/display/Hive/AvroSerDe#AvroSerDe-Overview–WorkingwithAvrofromHive
data sample:
{
"appId":{"string":"com.test.app"},
"timestamp":{"long":1495893601606},
"idList":{
"array":[
{"idType":15,"id":"6c:5c:14:c3:a5:39"},
{"idType":13,"id":"eb297afe56ff340b6bb7de5c5ab09193"}
]
}
}
But I don't know how to. I need some help to fix this. Thanks!
The top level of your Avro schema is expected to be a record type, which is why Hive doesn't allow this. A workaround could be to create the top level as a record and inside it create two fields of record type.
{
"type": "record",
"name": "myRecord",
"namespace": "com.test.app_list"
"fields": [
{
"type": "record",
"name": "Id",
"doc": "Device ID",
"fields": [
{
"name": "idType",
"type": "int"
},{
"name": "id",
"type": "string"
}
]
},
{
"type": "record",
"name": "AppList",
"doc": "",
"fields": [
{
"name": "appId",
"type": "string",
"avro.java.string": "String"
},
{
"name": "timestamp",
"type": "long"
},
{
"name": "idList",
"type": [{"type": "array", "items": "com.test.app_list.Id"}]
}
]
}
]
}

BQ command line how to use "--noflatten_results" option to have nested fields

I need to query a table with nested and repeated fields; using the bq command line gives me a flattened result, while I need to get the result in the original format.
The orignal format is looking like
{
"fields": [
{
"fields": [
{
"mode": "REQUIRED",
"name": "version",
"type": "STRING"
},
{
"mode": "REQUIRED",
"name": "hash",
"type": "STRING"
},
{
"mode": "REQUIRED",
"name": "header",
"type": "STRING"
},
{
"name": "organization",
"type": "STRING"
},
{
"mode": "REQUIRED",
"name": "date",
"type": "TIMESTAMP"
},
{
"mode": "REQUIRED",
"name": "encoding",
"type": "STRING"
},
{
"mode": "REQUIRED",
"name": "message_type",
"type": "STRING"
},
{
"mode": "REQUIRED",
"name": "receiver_code",
"type": "STRING"
},
{
"mode": "REQUIRED",
"name": "sender_code",
"type": "INTEGER"
},
{
"mode": "REQUIRED",
"name": "segment_separator",
"type": "STRING"
},
{
"fields": [
{
"fields": [
{
"name": "name",
"type": "STRING"
},
{
"name": "description",
"type": "STRING"
},
{
"name": "value",
"type": "STRING"
},
{
"fields": [
{
"name": "name",
"type": "STRING"
},
{
"name": "description",
"type": "STRING"
},
{
"name": "value",
"type": "STRING"
}
],
"mode": "REPEATED",
"name": "composite_elements",
"type": "RECORD"
}
],
"mode": "REPEATED",
"name": "elements",
"type": "RECORD"
},
{
"name": "description",
"type": "STRING"
},
{
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
}
],
"mode": "REPEATED",
"name": "segments",
"type": "RECORD"
},
{
"mode": "REQUIRED",
"name": "message_identifier",
"type": "INTEGER"
},
{
"mode": "REQUIRED",
"name": "element_separator",
"type": "STRING"
},
{
"name": "composite_element_separator",
"type": "STRING"
}
],
"mode": "REPEATED",
"name": "messages",
"type": "RECORD"
},
{
"mode": "REQUIRED",
"name": "syntax",
"type": "STRING"
},
{
"mode": "REQUIRED",
"name": "encoding",
"type": "STRING"
},
{
"mode": "REQUIRED",
"name": "file_name",
"type": "STRING"
},
{
"mode": "REQUIRED",
"name": "size",
"type": "INTEGER"
}
]
}
So how can I export (locally) the data with the nested representation?
[EDIT]
Export to Google to have nested representation
It seems the only way to export the nested representation is to export to a table, then extract it to Google Storage, and finally download the file.
bq query --destination_table=DEV.EDI_DATA_EXPORT --replace \
--allow_large_results --noflatten_results \
"select * from DEV.EDI_DATA where syntax='EDIFACT' " \
&& bq extract --destination_format=NEWLINE_DELIMITED_JSON DEV.EDI_DATA_EXPORT gs://mybucket/data.json \
&& gsutil cp gs://mybucket/data.json .
It's surprising to me...
Whenever you use --noflatten_results you also have to use --allow_large_results and --destination_table. This stores the non-flattened results in a new table.

Resolving error: returned "Output field used as input"

I'm trying to create a BigQuery table using Python. Other operations (queries, retrieving table bodies etc.) are working fine, but when trying to create a table I'm stuck with an error:
apiclient.errors.HttpError: https://www.googleapis.com/bigquery/v2/projects/marechal-consolidation/datasets/marechal_results/tables?alt=json
returned "Output field used as input">
Here's the command I'm executing:
projectId = 'xxxx'
dataSet = 'marechal_results'
with open(filePath+'tableStructure.json') as data_file:
structure = json.load(data_file)
table_result = tables.insert(projectId=projectId, datasetId=dataSet, body=structure).execute()
JSON table:
{
"kind": "bigquery#table",
"tableReference": {
"projectId": "xxxx",
"tableId": "xxxx",
"datasetId": "xxxx"
},
"type": "table",
"schema": {
"fields": [
{
"mode": "REQUIRED",
"type": "STRING",
"description": "Company",
"name": "COMPANY"
},
{
"mode": "REQUIRED",
"type": "STRING",
"description": "Currency",
"name": "CURRENCY"
}
// bunch of other fields follow...
]
}
}
Why am I receiving this error?
EDIT: Here's the JSON object I'm passing as parameter:
{
"kind": "bigquery#table",
"type": "TABLE",
"tableReference": {
"projectId": "xxxx",
"tableId": "xxxx",
"datasetId": "xxxx"
},
"schema": {
"fields": [
{
"type": "STRING",
"name": "COMPANY"
},
{
"type": "STRING",
"name": "YEAR"
},
{
"type": "STRING",
"name": "COUNTRY_ISO"
},
{
"type": "STRING",
"name": "COUNTRY"
},
{
"type": "STRING",
"name": "COUNTRY_GROUP"
},
{
"type": "STRING",
"name": "REGION"
},
{
"type": "STRING",
"name": "AREA"
},
{
"type": "STRING",
"name": "BU"
},
{
"type": "STRING",
"name": "REFERENCE"
},
{
"type": "FLOAT",
"name": "QUANTITY"
},
{
"type": "FLOAT",
"name": "NET_SALES"
},
{
"type": "FLOAT",
"name": "GROSS_SALES"
},
{
"type": "STRING",
"name": "FAM_GRP"
},
{
"type": "STRING",
"name": "FAMILY"
},
{
"type": "STRING",
"name": "PRESENTATION"
},
{
"type": "STRING",
"name": "ORIG_FAMILY"
},
{
"type": "FLOAT",
"name": "REF_PRICE"
},
{
"type": "STRING",
"name": "CODE1"
},
{
"type": "STRING",
"name": "CODE4"
}
]
}
}
This is probably too late to help you, but hopefully it helps the next poor soul like me. It took me a while to figure out what "Output field used as input" meant.
Though the API specifies the same object for the request (input) and response (output), some fields are only allowed in the response. In the docs you will see their descriptions prefixed with "Output only". From looking at your table definition I see that you have "type": "TABLE", and "type" is listed as an "Output only" property. So I would wager that if you remove it, that error will go away. Here is the link to the docs: https://cloud.google.com/bigquery/docs/reference/rest/v2/tables
It would help if they told you what field the violation was on.