Load the Json file in hive with dynamic JSON Schema - hive

I'm trying to load the yelp_business JSON into a Hive table, but I'm not able to create a schema for this JSON file in Hive. Please let me know how to create a schema in Hive for the JSON below. The attributes change depending on the business, and I'm using a JSON SerDe when creating the table.
{
"business_id": "8-NRKkPY1UiFXW20WXKiXg",
"name": "Filiberto's Mexican Food",
"neighborhood": "",
"address": "1440 N. Dysart Ave",
"city": "Avondale",
"state": "AZ",
"postal_code": "85323",
"latitude": 33.4481059352,
"longitude": -112.341302074,
"stars": 2.5,
"review_count": 40,
"is_open": 1,
"attributes": {
"Alcohol": "none",
"Ambience": "{'romantic': False, 'intimate': False, 'classy': False, 'hipster': False, 'divey': False, 'touristy': False, 'trendy': False, 'upscale': False, 'casual': True}",
"BikeParking": "True",
"BusinessAcceptsCreditCards": "True",
"BusinessParking": "{'garage': False, 'street': False, 'validated': False, 'lot': True, 'valet': False}",
"Caters": "False",
"GoodForKids": "True",
"GoodForMeal": "{'dessert': False, 'latenight': True, 'lunch': False, 'dinner': False, 'breakfast': False, 'brunch': False}",
"HasTV": "False",
"NoiseLevel": "average",
"OutdoorSeating": "False",
"RestaurantsAttire": "casual",
"RestaurantsDelivery": "False",
"RestaurantsGoodForGroups": "True",
"RestaurantsPriceRange2": "1",
"RestaurantsReservations": "False",
"RestaurantsTableService": "False",
"RestaurantsTakeOut": "True",
"WheelchairAccessible": "True",
"WiFi": "no"
},
"categories": "Mexican, Restaurants",
"hours": {
"Monday": "0:0-0:0",
"Tuesday": "0:0-0:0",
"Wednesday": "0:0-0:0",
"Thursday": "0:0-0:0",
"Friday": "0:0-0:0",
"Saturday": "0:0-0:0",
"Sunday": "0:0-0:0"
}
}
{
"business_id": "UTm5QZThPQlT35mkAcGOjg",
"name": "Maggie & Stella's Gifts",
"neighborhood": "Oakland",
"address": "209 Oakland Ave",
"city": "Pittsburgh",
"state": "PA",
"postal_code": "15213",
"latitude": 40.4414214,
"longitude": -79.9564571,
"stars": 3.5,
"review_count": 3,
"is_open": 1,
"attributes": {
"BikeParking": "True",
"BusinessAcceptsCreditCards": "True",
"BusinessParking": "{'garage': False, 'street': False, 'validated': False, 'lot': False, 'valet': False}",
"RestaurantsPriceRange2": "2"
},
"categories": "Flowers & Gifts, Gift Shops, Shopping",
"hours": {
"Monday": "9:0-18:0",
"Tuesday": "9:0-18:0",
"Wednesday": "9:0-18:0",
"Thursday": "9:0-18:0",
"Friday": "9:0-17:0",
"Saturday": "10:0-17:0"
}
}

Hive does not handle dynamic schemas.
What you can do is provide all the attributes, whether applicable or not.

Related

Setting up Continuous export of API data to csv for Racing lap times

I have access to an API that provides racing data, including driver names and their last lap, best lap, etc. I'm completely new to coding but learning the ropes. This is an example of the output from the API.
"Successful": true,
"Session": {
"RunNumber": "47",
"SessionName": "KART DRIVERS - EXPERIENCE / PROVA 11 22:30",
"TrackName": "KGV RACE TRACKS - CIRCUITO 109",
"TrackLength": "0.725",
"CurrentTime": "23:05:24",
"SessionTime": "00:13:27",
"TimeToGo": "00:04:32",
"LapsToGo": "9999",
"FlagStatus": "Green",
"SortMode": "race",
"Classes": {
"1": {
"ClassID": "1",
"Description": "RENTAL"
}
},
"Competitors": {
"018": {
"RacerID": "018",
"Number": "018",
"Transponder": "02",
"FirstName": "LR",
"LastName": "",
"Nationality": "",
"AdditionalData": "",
"ClassID": "1",
"Position": "28",
"Laps": "8",
"TotalTime": "00:12:34.376",
"BestPosition": "26",
"BestLap": "8",
"BestLapTime": "00:01:09.158",
"LastLapTime": "00:01:09.158"
},
"043": {
"RacerID": "043",
"Number": "043",
"Transponder": "48",
"FirstName": "LORENZO",
"LastName": "",
"Nationality": "",
"AdditionalData": "",
"ClassID": "1",
"Position": "32",
"Laps": "5",
"TotalTime": "00:12:54.095",
"BestPosition": "32",
"BestLap": "4",
"BestLapTime": "00:01:38.740",
"LastLapTime": "00:02:39.277"
How would I go about reading data from this API every 30 seconds to 1 minute and exporting it to a CSV file that I can open in Excel?

How to parse Json array and nested values?

I am trying to parse all the arrays and nested values in the given JSON, but for the nested blocks it creates three distinct records instead of one. Is there a way to display only one record instead of three without using UNION or GROUP BY?
"#odata.context": "https://graph.microsoft.com/v1.0/$metadata#auditLogs/signIns",
"value": [
{
"id": "a3ac3bec-4c4e-42c8-a11c-068f3dfda201",
"createdDateTime": "2021-08-31T18:00:44Z",
"userDisplayName": "abc",
"userPrincipalName": "sad2547#gmail.com",
"userId": "36a3a1f2-6133-4a0b-a6c9-020693ebdbd3",
"appId": "1fa516bf-1332-4140-85c9-d844d4e69ca1",
"appDisplayName": "ProxyIdentityExperienceFramework",
"ipAddress": "999.99.0.999",
"clientAppUsed": "Mobile Apps and Desktop clients",
"correlationId": "c478bdd4-1541-4cd0-bf7e-bd0695325246",
"conditionalAccessStatus": "notApplied",
"isInteractive": true,
"riskDetail": "hidden",
"riskLevelAggregated": "hidden",
"riskLevelDuringSignIn": "hidden",
"riskState": "none",
"riskEventTypes": [],
"riskEventTypes_v2": [],
"resourceDisplayName": "IdentityExperienceFramework",
"resourceId": "a3c649c7-5daa-4c3f-a5a0-a3fd7281ee20",
"status": {
"errorCode": 0,
"failureReason": "Other.",
"additionalDetails": null
},
"deviceDetail": {
"deviceId": "",
"displayName": "",
"operatingSystem": "Windows 10",
"browser": "Chrome 92.0.4515",
"isCompliant": false,
"isManaged": false,
"trustType": ""
},
"location": {
"city": "xyz",
"state": "def",
"countryOrRegion": "US",
"geoCoordinates": {
"altitude": null,
"latitude": 12.65875,
"longitude": -74.65286
}
},
"appliedConditionalAccessPolicies": []
},
{
"id": "a3ac3bec-4c4e-42c8-a11c-068f3dfda201",
"createdDateTime": "2021-08-31T18:00:44Z",
"userDisplayName": "abc",
"userPrincipalName": "sad2547#gmail.com",
"userId": "36a3a1f2-6133-4a0b-a6c9-020693ebdbd3",
"appId": "1fa516bf-1332-4140-85c9-d844d4e69ca1",
"appDisplayName": "ProxyIdentityExperienceFramework",
"ipAddress": "999.99.0.999",
"clientAppUsed": "Mobile Apps and Desktop clients",
"correlationId": "c478bdd4-1541-4cd0-bf7e-bd0695325246",
"conditionalAccessStatus": "notApplied",
"isInteractive": true,
"riskDetail": "hidden",
"riskLevelAggregated": "hidden",
"riskLevelDuringSignIn": "hidden",
"riskState": "none",
"riskEventTypes": [],
"riskEventTypes_v2": [],
"resourceDisplayName": "IdentityExperienceFramework",
"resourceId": "a3c649c7-5daa-4c3f-a5a0-a3fd7281ee20",
"status": {
"errorCode": 1,
"failureReason": "Other.",
"additionalDetails": null
},
"deviceDetail": {
"deviceId": "",
"displayName": "",
"operatingSystem": "Windows 10",
"browser": "Chrome 92.0.4505",
"isCompliant": false,
"isManaged": false,
"trustType": ""
},
"location": {
"city": "abc",
"state": "def",
"countryOrRegion": "US",
"geoCoordinates": {
"altitude": null,
"latitude": 12.65875,
"longitude": -74.65286
}
},
"appliedConditionalAccessPolicies": []
]
}
SQL:
SELECT x.*
FROM DEMO_JSON a,
JSON_TABLE(a.DOC, '$.value[*]'
COLUMNS (
IDs VARCHAR2(100) PATH id,
nested path status columns(
code path errorCode),
nested path deviceDetail columns(
browser path browser
),
nested path location columns(
city path city)
)
) X;
Output:
IDS CODE BROWSER CITY
a3ac3bec-4c4e-42c8-a11c-068f3dfda201 0
a3ac3bec-4c4e-42c8-a11c-068f3dfda201 Chrome 92.0.4515
a3ac3bec-4c4e-42c8-a11c-068f3dfda201 xyz
a3ac3bec-4c4e-42c8-a11c-068f3dfda201 1
a3ac3bec-4c4e-42c8-a11c-068f3dfda201 Chrome 92.0.4505
a3ac3bec-4c4e-42c8-a11c-068f3dfda201 abc

azure search exact match of file name not returning exact results

I am indexing all the file names into the index, but when I search with an exact file name in the search query, it returns other file names as well. Below is my index definition.
{
"fields": [
{
"name": "id",
"type": "Edm.String",
"facetable": true,
"filterable": true,
"key": true,
"retrievable": true,
"searchable": false,
"sortable": false,
"analyzer": null,
"indexAnalyzer": null,
"searchAnalyzer": null,
"synonymMaps": [],
"fields": []
},
{
"name": "FileName",
"type": "Edm.String",
"facetable": false,
"filterable": false,
"key": false,
"retrievable": true,
"searchable": true,
"sortable": false,
"analyzer": "keyword-analyzer",
"indexAnalyzer": null,
"searchAnalyzer": null,
"synonymMaps": [],
"fields": []
}
],
"scoringProfiles": [],
"defaultScoringProfile": null,
"corsOptions": null,
"analyzers": [
{
"name": "keyword-analyzer",
"#odata.type": "#Microsoft.Azure.Search.CustomAnalyzer",
"charFilters": [],
"tokenizer": "keyword_v2",
"tokenFilters": ["lowercase", "my_asciifolding", "my_word_delimiter"]
}
],
"tokenFilters": [
{
"#odata.type": "#Microsoft.Azure.Search.AsciiFoldingTokenFilter",
"name": "my_asciifolding",
"preserveOriginal": true
},
{
"#odata.type": "#Microsoft.Azure.Search.WordDelimiterTokenFilter",
"name": "my_word_delimiter",
"generateWordParts": true,
"generateNumberParts": false,
"catenateWords": false,
"catenateNumbers": false,
"catenateAll": false,
"splitOnCaseChange": true,
"preserveOriginal": true,
"splitOnNumerics": true,
"stemEnglishPossessive": false,
"protectedWords": []
}
],
"#odata.etag": "\"0x8D6FB2F498F9AD2\""
}
Below is my sample data
{
"value": [
{
"id": "1",
"FileName": "SamplePSDFile_1psd2680.psd"
},
{
"id": "2",
"FileName": "SamplePSDFile-1psd260.psd"
},
{
"id": "3",
"FileName": "SamplePSDFile_1psd2689.psd"
},
{
"id": "4",
"FileName": "SamplePSDFile-1psdxx2680.psd"
}
]
}
Below is the Analyze API results
{
"tokens": [
{
"token": "samplepsdfile_1psd2689.psd",
"startOffset": 0,
"endOffset": 26,
"position": 0
},
{
"token": "samplepsdfile",
"startOffset": 0,
"endOffset": 13,
"position": 0
},
{
"token": "psd",
"startOffset": 15,
"endOffset": 18,
"position": 1
},
{
"token": "psd",
"startOffset": 23,
"endOffset": 26,
"position": 2
}
]
}
When I search with the keyword "SamplePSDFile_1psd2689.psd", Azure Search returns three records in the results instead of only document 3. Below are my search query and the results.
?search="SamplePSDFile_1psd2689.psd"&api-version=2019-05-06&$count=true&queryType=full&searchMode=All
{
"#odata.count": 3,
"value": [
{
"#search.score": 2.3387241,
"id": "2",
"FileName": "SamplePSDFile-1psd260.psd"
},
{
"#search.score": 2.2493405,
"id": "3",
"FileName": "SamplePSDFile_1psd2689.psd"
},
{
"#search.score": 2.2493405,
"id": "1",
"FileName": "SamplePSDFile_1psd2680.psd"
}
]
}
How can I achieve my expected results? I tried with and without double quotes around the keyword, along with all the other options, but no luck. What am I doing wrong in this case?
Somebody suggested using $filter, but that field isn't filterable in our case.
Please help me with this.
If you are looking for exact match then you probably don't want any analyzer involved. Give it a try with this line
"analyzer": "keyword-analyzer"
changed to
"analyzer": null
If you need to be able to do exact match on the field and also support partial keyword searches then you need to index the field twice with different names. Maybe append “Exact” to the exact match field name and don’t use an analyzer for that one. The name without exact can have an analyzer. Then search on the field using the right field name index depending on the type of search.

Graph API doesn't restore a mail message, instead it is creating a new message with CreateDateTime automatically updated to present date

When I restore an email message via the Graph API with a POST request, instead of restoring the message it creates a new message with the same data: the createdDateTime in the JSON is updated even though I pass the previous createdDateTime.
To elaborate: I want to restore the mail message below, which was created in 2018 ("createdDateTime": "2018-12-31T14:49:42Z"), but when I post the same JSON to restore it, createdDateTime is automatically updated to the present date. This is a problem because it is not a restore; it is just like creating a new message.
{
"#odata.type": "#microsoft.graph.eventMessageResponse",
"#odata.etag": "W/\"DAAAABYAAABjFtMyIejaSbuRSeM/auJwAAGfpJnO\"",
"id": "AAMkAGZiNGI0MWM4LTQ0NjUtNDUyMy1hOTI2LWNopaTZiMGYxZTBkNQBGAAAAAACaBIVNrajXSj6AQcjiAFBwBjFtMyIejaSbuRSeM-auJwAAAAAAEJAABjFtMyIejaSbuRSeM-auJwAAGf4eRfAAA=",
"createdDateTime": "2018-12-31T14:49:42Z",
"lastModifiedDateTime": "2020-12-31T14:49:46Z",
"changeKey": "DopskAkslaAABjFtMyIejaSbuRSeM/auJwAAGfpJnO",
"categories": [],
"receivedDateTime": "2020-12-31T14:49:43Z",
"sentDateTime": "2020-12-31T14:49:42Z",
"hasAttachments": false,
"internetMessageId": "<MA1PR0101MB207oPF15907003958DB7A58BDD60#MA1PR0101MB2070.INDPRD01.PROD.OUTLOOK.COM>",
"subject": "Accepted: New Year Party",
"bodyPreview": "",
"importance": "normal",
"parentFolderId": "AQMkAGZiNGI0MWM4LTQ0ADY1LTQ1MjMtYTkyNi1jZGU2YjBmMWUwZDUALgAAA5oEhU2tqNdKuqPoBByOIAlkallspspspspspppAAAIBCQAAAA==",
"conversationId": "AAQkAGZiNGI0MWM4LTQ0NjUtNDUyMy1hOTI2LWNkZTZiMGYxZTBkNQAQAEJ5AU8Tk1nklXE3E0XGh2w=",
"conversationIndex": "AQHW34QsrZ0Wy3deoU2Bn2byefNABQ==",
"isDeliveryReceiptRequested": null,
"isReadReceiptRequested": false,
"isRead": true,
"isDraft": false,
"inferenceClassification": "focused",
"meetingMessageType": "meetingAccepted",
"type": "singleInstance",
"isOutOfDate": false,
"isAllDay": false,
"isDelegated": false,
"responseType": "accepted",
"recurrence": null,
"body": {
"contentType": "text",
"content": ""
},
"sender": {
"emailAddress": {
"name": "Mark Rober",
"address": "mark#securemigration.in"
}
},
"from": {
"emailAddress": {
"name": "Mark Rober",
"address": "mark#securemigration.in"
}
},
"toRecipients": [
{
"emailAddress": {
"name": "#Class Yammer",
"address": "ClassYammer#securemigration.in"
}
}
],
"ccRecipients": [],
"bccRecipients": [],
"replyTo": [],
"flag": {
"flagStatus": "notFlagged"
},
"startDateTime": {
"dateTime": "2020-12-31T15:00:00.0000000",
"timeZone": "UTC"
},
"endDateTime": {
"dateTime": "2020-12-31T15:30:00.0000000",
"timeZone": "UTC"
}
}
Please help me with it.

How to get list of Users under a certain Manager/Approver in Coupa API?

I'm lost and I'm hoping that someone may have worked on this before.
So Coupa has its API:
https://coupadocs.atlassian.net/wiki/display/integrate/Users+API
I was able to retrieve user information together with the corresponding manager. Sample response:
https://unknownserver-test.coupahost.com/api/users?employee-number=10003323
[
{
"id": 2756,
"created-at": "2017-03-30T09:29:19-05:00",
"updated-at": "2017-03-31T04:30:53-05:00",
"login": "user1.user1",
"email": "staging23#coupa.com",
"purchasing-user": false,
"expense-user": false,
"sourcing-user": false,
"inventory-user": false,
"employee-number": "10003323",
"phone-work": null,
"phone-mobile": null,
"firstname": "user1",
"lastname": "user1",
"fullname": "user1 user1",
"api-user": false,
"active": false,
"salesforce-id": null,
"account-security-type": 0,
"authentication-method": "coupa_credentials",
"sso-identifier": null,
"default-locale": null,
"default-account": null,
"business-group-security-type": null,
"edit-invoice-on-quick-entry": false,
"avatar-thumb-url": null,
"mention-name": "user1user1",
"company-employee-id": "10003323",
"netsuite-employee-id": "10003323",
"subsidiary": {
"id": 1592,
"external-ref-num": null,
"external-ref-code": "company North America:1"
},
"job-title": {
"id": 2591,
"external-ref-num": null,
"external-ref-code": "VP, Sales"
},
"employee-type": "",
"default-expense-region": "",
"default-geo-spend": "",
"notes": "",
"exclude-from-autosarf": "",
"roles": [
{
"id": 10,
"name": "Expense User"
}
],
"manager": {
"id": 838,
"login": "john.doe",
"email": "staging#coupa.com"
},
"default-currency": {
"id": 1,
"code": "USD"
},
"department": {
"id": 342,
"name": "Sales - Exec:176"
},
"expenses-delegated-to": [],
"can-expense-for": [],
"content-groups": [],
"account-groups": [],
"approval-groups": [],
"working-warehouses": [],
"inventory-organizations": [],
"created-by": {
"id": 2748,
"login": "user1 creator",
"email": "user1.creator#company.com"
},
"updated-by": {
"id": 2748,
"login": "user1 creator",
"email": "user1.creator#company.com"
}
}
]
What I've tried are these:
https://unknownserver-test.coupahost.com/api/users?user[manager][id]=838&return_object=shallow
https://unknownserver-test.coupahost.com/api/users?manager[id]=838&return_object=shallow
https://unknownserver-test.coupahost.com/api/users?users[user][manager][id]=838&return_object=shallow
https://{{URL PREFIX}}.{{HOST}}.com/api/users?manager_id=838&return_object=shallow
If you only need the IDs of the users, you'd get better performance with return_object=limited
If there are more than 50 users returned, you'll have to paginate with the offset query param.