Avro schema: multiple records reference the same data type, "Unknown union branch" error

I have an Avro schema in which the Customer record references the CustomerAddress record:
[
  {
    "type": "record",
    "namespace": "com.example",
    "name": "CustomerAddress",
    "fields": [
      { "name": "address", "type": "string" },
      { "name": "city", "type": "string" },
      { "name": "postcode", "type": ["string", "int"] },
      { "name": "type", "type": { "type": "enum", "name": "type", "symbols": ["POBOX", "RESIDENTIAL", "ENTERPRISE"] } }
    ]
  },
  {
    "type": "record",
    "namespace": "com.example",
    "name": "Customer",
    "fields": [
      { "name": "first_name", "type": "string" },
      { "name": "middle_name", "type": ["null", "string"], "default": null },
      { "name": "last_name", "type": "string" },
      { "name": "age", "type": "int" },
      { "name": "height", "type": "float" },
      { "name": "weight", "type": "float" },
      { "name": "automated_email", "type": "boolean", "default": true },
      { "name": "customer_emails", "type": { "type": "array", "items": "string" }, "default": [] },
      { "name": "customer_address", "type": "com.example.CustomerAddress" }
    ]
  }
]
I have this JSON payload:
{
  "Customer": {
    "first_name": "John",
    "middle_name": null,
    "last_name": "Smith",
    "age": 25,
    "height": 177.6,
    "weight": 120.6,
    "automated_email": true,
    "customer_emails": ["ning.chang@td.com", "test@td.com"],
    "customer_address": {
      "address": "21 2nd Street",
      "city": "New York",
      "postcode": "10021",
      "type": "RESIDENTIAL"
    }
  }
}
When I run the command java -jar avro-tools-1.8.2.jar fromjson --schema-file customer.avsc customer.json I get the following exception:
Exception in thread "main" org.apache.avro.AvroTypeException: Unknown union branch Customer

In your JSON data you use the key Customer, but you have to use the fully qualified name: because the schema file defines two top-level records, avro-tools parses it as a union, and Avro's JSON encoding names a union branch by its full name. So it should be com.example.Customer.
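With that change the payload would look like the sketch below. Note that postcode is itself a union ["string", "int"], so in Avro's JSON encoding its value also has to be wrapped with its branch name (a null, as in middle_name, needs no wrapper):

{
  "com.example.Customer": {
    "first_name": "John",
    "middle_name": null,
    "last_name": "Smith",
    "age": 25,
    "height": 177.6,
    "weight": 120.6,
    "automated_email": true,
    "customer_emails": ["ning.chang@td.com", "test@td.com"],
    "customer_address": {
      "address": "21 2nd Street",
      "city": "New York",
      "postcode": {"string": "10021"},
      "type": "RESIDENTIAL"
    }
  }
}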

Amazon Personalize dataset import job creation failed

My schema looks like this:
{
  "type": "record",
  "name": "Interactions",
  "namespace": "com.amazonaws.personalize.schema",
  "fields": [
    { "name": "USER_ID", "type": "string" },
    { "name": "ITEM_ID", "type": "string" },
    { "name": "TIMESTAMP", "type": "long" },
    { "name": "EVENT_TYPE", "type": "string" },
    { "name": "EVENT_VALUE", "type": "float" },
    { "name": "SESSION_ID", "type": "string" },
    { "name": "SERVICE_TYPE", "type": "string" },
    { "name": "SERVICE_LOCATION", "type": "string" },
    { "name": "SERVICE_PRICE", "type": "int" },
    { "name": "SERVICE_TIME", "type": "long" },
    { "name": "USER_LOCATION", "type": "string" }
  ]
}
I uploaded my .csv file to the S3 bucket user-flights-bucket. When I tried to import it into Personalize, the job failed with the reason:
Path does not exist: s3://user-flights-bucket/null;
Give the full path to the file, including its name, in the data location:
s3://user-flights-bucket/<your file name>.csv
It will work.
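For reference, a minimal sketch of creating the import job with boto3 (the job name, ARNs, and file name are placeholders):

import boto3

personalize = boto3.client("personalize")

# The data location must point at the CSV file itself, not just the bucket.
response = personalize.create_dataset_import_job(
    jobName="user-flights-import",           # placeholder name
    datasetArn="arn:aws:personalize:...",    # your interactions dataset ARN
    dataSource={"dataLocation": "s3://user-flights-bucket/your-file.csv"},
    roleArn="arn:aws:iam::...",              # role with read access to the bucket
)
print(response["datasetImportJobArn"])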

Concatenate/build JSON objects from a PostgreSQL database

I'm trying to build a JSON array from data in a database. With PostgreSQL, I want to build a JSON file that matches the following JSON Schema:
{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "title": "Student information",
  "type": "object",
  "required": [
    "student",
    "name",
    "login",
    "program",
    "branch",
    "finished"
  ],
  "properties": {
    "student": {
      "type": "string",
      "minLength": 10,
      "maxLength": 10,
      "title": "A national identification number, 10 digits"
    },
    "name": {
      "type": "string",
      "title": "The name of the student"
    },
    "login": {
      "type": "string",
      "title": "The university issued computer login"
    },
    "program": {
      "type": "string"
    },
    "branch": {
      "anyOf": [{ "type": "string" }, { "type": "null" }]
    },
    "finished": {
      "type": "array",
      "title": "A list of read courses",
      "items": {
        "type": "object",
        "required": [
          "course",
          "code",
          "credits",
          "grade"
        ],
        "properties": {
          "course": {
            "type": "string",
            "title": "Course name"
          },
          "code": {
            "type": "string",
            "minLength": 6,
            "maxLength": 6,
            "title": "Course code"
          },
          "credits": {
            "type": "number",
            "title": "Academic credits"
          },
          "grade": {
            "enum": ["U", "3", "4", "5"]
          }
        }
      }
    }
  }
}
I have tried the following to get a better understanding of how to concatenate, build and arrange the data that exists in the database:
SELECT array_to_json(array_agg(row_to_json(t)))
FROM (SELECT idnr, name, login, program FROM students) t;
and
SELECT json_build_object('properties',
         json_build_object('student', json_build_object('idnr', idnr),
                           'name', json_build_object('name', name),
                           'login', json_build_object('login', login),
                           'program', json_build_object('program', program),
                           'branch', json_build_object('branch', branch)))
FROM Basicinformation;
How do I build and concatenate such objects with PostgreSQL?
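For what it's worth, a sketch of the usual approach: json_build_object assembles each student document and json_agg collects the course rows into the "finished" array. This assumes a students table as above and a hypothetical finished_courses table keyed by idnr; adjust the names to your schema.

-- One JSON document per student, with courses aggregated into an array.
SELECT json_build_object(
         'student',  s.idnr,
         'name',     s.name,
         'login',    s.login,
         'program',  s.program,
         'branch',   s.branch,
         'finished', COALESCE(
           (SELECT json_agg(json_build_object(
                     'course',  c.course,
                     'code',    c.code,
                     'credits', c.credits,
                     'grade',   c.grade))
            FROM finished_courses c
            WHERE c.idnr = s.idnr),
           '[]'::json))
FROM students s;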

BQ command line how to use "--noflatten_results" option to have nested fields

I need to query a table with nested and repeated fields. Using the bq command line gives me a flattened result, while I need the result in the original format.
The original format looks like this:
{
  "fields": [
    {
      "fields": [
        { "mode": "REQUIRED", "name": "version", "type": "STRING" },
        { "mode": "REQUIRED", "name": "hash", "type": "STRING" },
        { "mode": "REQUIRED", "name": "header", "type": "STRING" },
        { "name": "organization", "type": "STRING" },
        { "mode": "REQUIRED", "name": "date", "type": "TIMESTAMP" },
        { "mode": "REQUIRED", "name": "encoding", "type": "STRING" },
        { "mode": "REQUIRED", "name": "message_type", "type": "STRING" },
        { "mode": "REQUIRED", "name": "receiver_code", "type": "STRING" },
        { "mode": "REQUIRED", "name": "sender_code", "type": "INTEGER" },
        { "mode": "REQUIRED", "name": "segment_separator", "type": "STRING" },
        {
          "fields": [
            {
              "fields": [
                { "name": "name", "type": "STRING" },
                { "name": "description", "type": "STRING" },
                { "name": "value", "type": "STRING" },
                {
                  "fields": [
                    { "name": "name", "type": "STRING" },
                    { "name": "description", "type": "STRING" },
                    { "name": "value", "type": "STRING" }
                  ],
                  "mode": "REPEATED",
                  "name": "composite_elements",
                  "type": "RECORD"
                }
              ],
              "mode": "REPEATED",
              "name": "elements",
              "type": "RECORD"
            },
            { "name": "description", "type": "STRING" },
            { "mode": "REQUIRED", "name": "name", "type": "STRING" }
          ],
          "mode": "REPEATED",
          "name": "segments",
          "type": "RECORD"
        },
        { "mode": "REQUIRED", "name": "message_identifier", "type": "INTEGER" },
        { "mode": "REQUIRED", "name": "element_separator", "type": "STRING" },
        { "name": "composite_element_separator", "type": "STRING" }
      ],
      "mode": "REPEATED",
      "name": "messages",
      "type": "RECORD"
    },
    { "mode": "REQUIRED", "name": "syntax", "type": "STRING" },
    { "mode": "REQUIRED", "name": "encoding", "type": "STRING" },
    { "mode": "REQUIRED", "name": "file_name", "type": "STRING" },
    { "mode": "REQUIRED", "name": "size", "type": "INTEGER" }
  ]
}
So how can I export the data (locally) with the nested representation?
[EDIT]
Export via Google Cloud Storage to keep the nested representation
It seems the only way to export the nested representation is to query into a destination table, extract that table to Google Cloud Storage, and finally download the file:
bq query --destination_table=DEV.EDI_DATA_EXPORT --replace \
--allow_large_results --noflatten_results \
"select * from DEV.EDI_DATA where syntax='EDIFACT' " \
&& bq extract --destination_format=NEWLINE_DELIMITED_JSON DEV.EDI_DATA_EXPORT gs://mybucket/data.json \
&& gsutil cp gs://mybucket/data.json .
It's surprising to me...
Whenever you use --noflatten_results you also have to use --allow_large_results and --destination_table. This stores the non-flattened results in a new table.
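In other words, the minimal working form of the query step is the one the question already uses; the nested rows land in the destination table rather than on stdout, which is why the extract and download steps follow:

bq query --allow_large_results --noflatten_results \
  --destination_table=DEV.EDI_DATA_EXPORT --replace \
  "select * from DEV.EDI_DATA where syntax='EDIFACT'"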

Resolving error: returned "Output field used as input"

I'm trying to create a BigQuery table using Python. Other operations (queries, retrieving table bodies, etc.) work fine, but when trying to create a table I'm stuck with this error:
apiclient.errors.HttpError: https://www.googleapis.com/bigquery/v2/projects/marechal-consolidation/datasets/marechal_results/tables?alt=json
returned "Output field used as input">
Here's the command I'm executing:
import json

# `tables` is the tables() resource of an authorized BigQuery API client.
projectId = 'xxxx'
dataSet = 'marechal_results'
with open(filePath + 'tableStructure.json') as data_file:
    structure = json.load(data_file)
table_result = tables.insert(projectId=projectId, datasetId=dataSet,
                             body=structure).execute()
JSON table:
{
  "kind": "bigquery#table",
  "tableReference": {
    "projectId": "xxxx",
    "tableId": "xxxx",
    "datasetId": "xxxx"
  },
  "type": "table",
  "schema": {
    "fields": [
      { "mode": "REQUIRED", "type": "STRING", "description": "Company", "name": "COMPANY" },
      { "mode": "REQUIRED", "type": "STRING", "description": "Currency", "name": "CURRENCY" }
      // bunch of other fields follow...
    ]
  }
}
Why am I receiving this error?
EDIT: Here's the JSON object I'm passing as a parameter:
{
  "kind": "bigquery#table",
  "type": "TABLE",
  "tableReference": {
    "projectId": "xxxx",
    "tableId": "xxxx",
    "datasetId": "xxxx"
  },
  "schema": {
    "fields": [
      { "type": "STRING", "name": "COMPANY" },
      { "type": "STRING", "name": "YEAR" },
      { "type": "STRING", "name": "COUNTRY_ISO" },
      { "type": "STRING", "name": "COUNTRY" },
      { "type": "STRING", "name": "COUNTRY_GROUP" },
      { "type": "STRING", "name": "REGION" },
      { "type": "STRING", "name": "AREA" },
      { "type": "STRING", "name": "BU" },
      { "type": "STRING", "name": "REFERENCE" },
      { "type": "FLOAT", "name": "QUANTITY" },
      { "type": "FLOAT", "name": "NET_SALES" },
      { "type": "FLOAT", "name": "GROSS_SALES" },
      { "type": "STRING", "name": "FAM_GRP" },
      { "type": "STRING", "name": "FAMILY" },
      { "type": "STRING", "name": "PRESENTATION" },
      { "type": "STRING", "name": "ORIG_FAMILY" },
      { "type": "FLOAT", "name": "REF_PRICE" },
      { "type": "STRING", "name": "CODE1" },
      { "type": "STRING", "name": "CODE4" }
    ]
  }
}
This is probably too late to help you, but hopefully it helps the next poor soul like me. It took me a while to figure out what "Output field used as input" meant.
Though the API specifies the same object for the request (input) and response (output), some fields are only allowed in the response. In the docs you will see their descriptions prefixed with "Output only". Looking at your table definition, you have "type": "TABLE", and "type" is listed as an "Output only" property, so I would wager that if you remove it the error will go away. Here is the link to the docs: https://cloud.google.com/bigquery/docs/reference/rest/v2/tables
It would help if they told you what field the violation was on.
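A defensive sketch, continuing from the question's code: strip the properties the tables docs mark "Output only" before using the resource as input (the field list below is drawn from those docs; check them for your API version):

# Remove output-only properties before sending the table resource as input.
for output_only in ('type', 'id', 'selfLink', 'etag',
                    'creationTime', 'lastModifiedTime', 'numBytes', 'numRows'):
    structure.pop(output_only, None)

table_result = tables.insert(projectId=projectId, datasetId=dataSet,
                             body=structure).execute()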

bigquery - Input contained no data

I'm testing the BigQuery platform with real traffic from my site (more than 80M events per day).
I'm uploading gz files using the Java API, with load (insert) jobs.
In some cases, I've received this message: Input contained no data
{
  "kind": "bigquery#job",
  "etag": "\"******************\"",
  "id": "*********",
  "selfLink": "********",
  "jobReference": {
    "projectId": "********",
    "jobId": "**************"
  },
  "configuration": {
    "load": {
      "schema": {
        "fields": [
          { "name": "tms", "type": "TIMESTAMP" },
          { "name": "page", "type": "STRING" },
          { "name": "user_agent", "type": "STRING" },
          { "name": "print_id", "type": "STRING" },
          { "name": "referer", "type": "STRING" },
          { "name": "gtms", "type": "TIMESTAMP" },
          { "name": "cookies", "type": "STRING" },
          { "name": "ip", "type": "STRING" },
          { "name": "site", "type": "STRING" },
          { "name": "call_params", "type": "STRING" },
          {
            "name": "domains",
            "type": "RECORD",
            "mode": "REPEATED",
            "fields": [
              { "name": "name", "type": "STRING" },
              {
                "name": "ads",
                "type": "RECORD",
                "mode": "REPEATED",
                "fields": [
                  { "name": "id", "type": "STRING" },
                  { "name": "type", "type": "STRING" },
                  { "name": "position", "type": "STRING" },
                  { "name": "strategy", "type": "STRING" },
                  { "name": "score", "type": "STRING" },
                  { "name": "cpc", "type": "STRING" },
                  { "name": "site", "type": "STRING" },
                  { "name": "categ", "type": "STRING" },
                  { "name": "cust", "type": "STRING" },
                  { "name": "campaign", "type": "STRING" }
                ]
              }
            ]
          }
        ]
      },
      "destinationTable": {
        "projectId": "**********",
        "datasetId": "*******",
        "tableId": "********"
      },
      "createDisposition": "CREATE_IF_NEEDED",
      "writeDisposition": "WRITE_APPEND",
      "sourceFormat": "NEWLINE_DELIMITED_JSON"
    }
  },
  "status": {
    "state": "DONE",
    "errors": [
      { "reason": "invalid", "message": "Input contained no data" }
    ]
  },
  "statistics": {
    "creationTime": "1416491042309",
    "startTime": "1416491061440",
    "endTime": "1416491076876",
    "load": {
      "inputFiles": "1",
      "inputFileBytes": "0",
      "outputRows": "0",
      "outputBytes": "0"
    }
  }
}
And after this one, all my jobs return the same response.
Can anybody tell me the reason for this behaviour?
Thanks!!!!
Your job succeeded: there is no "errorResult" field in the status.
First, I understand the confusion: the way errors and warnings are returned by the jobs API is, frankly, as clear as mud.
Here's the quick overview:
status.errorResult is where a job's fatal error is reported. If no errorResult is reported, the job succeeded.
status.errors is where individual errors and warnings are reported.
Please reference the documentation https://cloud.google.com/bigquery/docs/reference/v2/jobs and search for status.errorResult and status.errors.
Most people don't hit this problem since a job only encountering a warning is pretty rare.
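A sketch of that check in Python, with job being the job resource shown above:

status = job["status"]

if "errorResult" in status:
    # A fatal error was reported: the job failed.
    raise RuntimeError("Job failed: " + status["errorResult"]["message"])

# No errorResult, so the job succeeded; entries in status["errors"]
# are warnings (here, one empty input file).
for warning in status.get("errors", []):
    print("warning:", warning["message"])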
Ok, the problem was very simple: the gz file.
Thanks!