Uploading JSON to BigQuery

Update: I'll answer this myself right away (this code works). My customized upload code is based on: https://developers.google.com/bigquery/loading-data-into-bigquery#loaddatapostrequest
import sys
import json
import time
from apiclient.discovery import build
from oauth2client.file import Storage
from oauth2client.client import AccessTokenRefreshError
from oauth2client.client import OAuth2WebServerFlow
from oauth2client.tools import run
from apiclient.errors import HttpError
import httplib2

FLOW = OAuth2WebServerFlow(
    client_id='xxxxxxx.apps.googleusercontent.com',
    client_secret='shhhhhhhhhhhh',
    scope='https://www.googleapis.com/auth/bigquery',
    user_agent='my-program-name/1.0')

def loadTable(http, service):
    projectId = 'drc-compute'
    datasetId = 'standing'
    tableId = 'test_' + str(int(time.time()))
    url = "https://www.googleapis.com/upload/bigquery/v2/projects/" + projectId + "/jobs"
    schema = open('test_schema.json', 'r')

    # Create the body of the request, separated by a boundary of xxx
    newresource = ('--xxx\n' +
                   'Content-Type: application/json; charset=UTF-8\n' + '\n' +
                   '{\n' +
                   '  "configuration": {\n' +
                   '    "load": {\n' +
                   '      "sourceFormat": "NEWLINE_DELIMITED_JSON",\n' +
                   '      "schema": {\n' +
                   '        "fields": ' + schema.read() + '\n' +
                   '      },\n' +
                   '      "destinationTable": {\n' +
                   '        "projectId": "' + projectId + '",\n' +
                   '        "datasetId": "' + datasetId + '",\n' +
                   '        "tableId": "' + tableId + '"\n' +
                   '      }\n' +
                   '    }\n' +
                   '  }\n' +
                   '}\n' +
                   '--xxx\n' +
                   'Content-Type: application/octet-stream\n' +
                   '\n')

    # Append data from the specified file to the request body
    f = open('test.json', 'r')
    newresource += f.read().replace('\n', '\r\n')

    # Signify the end of the body
    newresource += ('--xxx--\n')
    print newresource

    headers = {'Content-Type': 'multipart/related; boundary=xxx'}
    resp, content = http.request(url, method="POST", body=newresource, headers=headers)

    if not resp.status == 200:
        print resp
        print content
    else:
        jsonResponse = json.loads(content)
        jobReference = jsonResponse['jobReference']['jobId']
        while True:
            jobCollection = service.jobs()
            getJob = jobCollection.get(projectId=projectId, jobId=jobReference).execute()
            currentStatus = getJob['status']['state']
            if 'DONE' == currentStatus:
                print "Done Loading!"
                return
            else:
                print 'Waiting to load...'
                print 'Current status: ' + currentStatus
                print time.ctime()
                time.sleep(10)

def main(argv):
    # If the credentials don't exist or are invalid, run the native client
    # auth flow. The Storage object will ensure that if successful the good
    # credentials will get written back to a file.
    storage = Storage('bigquery2.dat')  # Choose a file name to store the credentials.
    credentials = storage.get()
    if credentials is None or credentials.invalid:
        credentials = run(FLOW, storage)

    # Create an httplib2.Http object to handle our HTTP requests and authorize it
    # with our good credentials.
    http = httplib2.Http()
    http = credentials.authorize(http)
    service = build('bigquery', 'v2', http=http)
    #datasets = service.datasets().list(projectId='917370487687').execute()
    loadTable(http, service)

if __name__ == '__main__':
    main(sys.argv)
You'll need your own BigQuery client_id and client_secret to replicate this, and you'll need to run it once on a machine where you can open a browser and log in to Google. After that, bigquery2.dat stores the OAuth2 refresh tokens and such. The simple test data I'm playing with is just:
test.json
{"asdf": "dd"}
{"asdf": "ax"}
test_schema.json
[
  {
    "type": "STRING",
    "name": "asdf",
    "mode": "NULLABLE"
  }
]
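Once the job reports DONE, a quick sanity check is to query the new table through the same authorized service object. A minimal sketch, reusing the names from the script above:

# Hedged sketch: verify the load by querying the freshly created table.
# Assumes service, projectId, datasetId and tableId from loadTable() above.
query_body = {'query': 'SELECT asdf FROM [' + projectId + ':' + datasetId + '.' + tableId + '] LIMIT 10'}
result = service.jobs().query(projectId=projectId, body=query_body).execute()
for row in result.get('rows', []):
    print row['f'][0]['v']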

To not leave this question open: as already answered in the question section itself, thanks to @noonien's comment:
"remember to set sourceFormat within the load properties to
NEWLINE_DELIMITED_JSON"
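For reference, the relevant piece of the job configuration (a minimal sketch; the destination IDs are placeholders) is just:

{
  "configuration": {
    "load": {
      "sourceFormat": "NEWLINE_DELIMITED_JSON",
      "schema": {"fields": [{"type": "STRING", "name": "asdf", "mode": "NULLABLE"}]},
      "destinationTable": {"projectId": "...", "datasetId": "...", "tableId": "..."}
    }
  }
}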

Related

Dropbox - Automatic refresh token using OAuth 2.0 with offline access

I know: automatic token refreshing is not a new topic.
This is the use case that generated my problem: let's say we want to extract data from Dropbox. Below you can find the code. The first time, it works perfectly: 1) the user goes to the generated link; 2) after allowing the app, the user copies and pastes the authorization code into the input box.
The problem arises when, some hours later, the user wants to do the same operation. How can I avoid or bypass generating a new authorization code and go straight to the operation?
As you can see in the code, within a short period it is possible to re-inject the auth code into the script (see the commented line). But after an hour or more this is no longer possible.
Any help is welcome.
#!/usr/bin/env python3
import dropbox
from dropbox import DropboxOAuth2FlowNoRedirect
import pandas as pd  # needed by dropbox_list_files() below

'''
Populate your app key in order to run this locally
'''
APP_KEY = ""

auth_flow = DropboxOAuth2FlowNoRedirect(APP_KEY, use_pkce=True, token_access_type='offline')
target = '/DVR/DVR/'

authorize_url = auth_flow.start()
print("1. Go to: " + authorize_url)
print("2. Click \"Allow\" (you might have to log in first).")
print("3. Copy the authorization code.")
auth_code = input("Enter the authorization code here: ").strip()
#auth_code = "3NIcPps_UxAAAAAAAAAEin1sp5jUjrErQ6787_RUbJU"

try:
    oauth_result = auth_flow.finish(auth_code)
except Exception as e:
    print('Error: %s' % (e,))
    exit(1)

with dropbox.Dropbox(oauth2_refresh_token=oauth_result.refresh_token, app_key=APP_KEY) as dbx:
    dbx.users_get_current_account()
    print("Successfully set up client!")
    for entry in dbx.files_list_folder(target).entries:
        print(entry.name)

    # function to get the list of files in a folder
    def dropbox_list_files(path):
        try:
            files = dbx.files_list_folder(path).entries
            files_list = []
            for file in files:
                if isinstance(file, dropbox.files.FileMetadata):
                    metadata = {
                        'name': file.name,
                        'path_display': file.path_display,
                        'client_modified': file.client_modified,
                        'server_modified': file.server_modified
                    }
                    files_list.append(metadata)
            df = pd.DataFrame.from_records(files_list)
            return df.sort_values(by='server_modified', ascending=False)
        except Exception as e:
            print('Error getting list of files from Dropbox: ' + str(e))

    def create_links(target, csvfile):
        filesList = []
        print("creating links for folder " + target)
        files = dbx.files_list_folder('/' + target)
        filesList.extend(files.entries)
        print(len(files.entries))
        while files.has_more:
            files = dbx.files_list_folder_continue(files.cursor)
            filesList.extend(files.entries)
            print(len(files.entries))
        for file in filesList:
            if isinstance(file, dropbox.files.FileMetadata):
                filename = file.name + ',' + file.path_display + ',' + str(file.size) + ','
                link_data = dbx.sharing_create_shared_link(file.path_lower)
                filename += link_data.url + '\n'
                csvfile.write(filename)
                print(file.name)
            else:
                create_links(target + '/' + file.name, csvfile)

    # create links for all files in the folder
    create_links(target, open('links.csv', 'w', encoding='utf-8'))

    listing = dbx.files_list_folder(target)
    # todo: add implementation for files_list_folder_continue
    for entry in listing.entries:
        if entry.name.endswith(".pdf"):
            # note: this simple implementation only works for files in the root of the folder
            res = dbx.sharing_get_shared_links(target + entry.name)
            #f.write(res.content)
            print('\r', res)
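One way to avoid repeating the authorization step (a sketch, not official guidance; REFRESH_TOKEN_FILE is a hypothetical name): persist the refresh token from the first run and reuse it on later runs. With token_access_type='offline' the refresh token remains valid until revoked, and the SDK uses it to obtain short-lived access tokens automatically:

import os
import dropbox
from dropbox import DropboxOAuth2FlowNoRedirect

APP_KEY = ""
REFRESH_TOKEN_FILE = "dropbox_refresh_token.txt"  # hypothetical storage location

def get_dbx():
    # Reuse the stored refresh token if we already have one
    if os.path.exists(REFRESH_TOKEN_FILE):
        with open(REFRESH_TOKEN_FILE) as f:
            refresh_token = f.read().strip()
    else:
        # First run only: interactive authorization
        auth_flow = DropboxOAuth2FlowNoRedirect(APP_KEY, use_pkce=True, token_access_type='offline')
        print("1. Go to: " + auth_flow.start())
        auth_code = input("Enter the authorization code here: ").strip()
        refresh_token = auth_flow.finish(auth_code).refresh_token
        with open(REFRESH_TOKEN_FILE, "w") as f:
            f.write(refresh_token)
    # The SDK mints short-lived access tokens from the refresh token as needed
    return dropbox.Dropbox(oauth2_refresh_token=refresh_token, app_key=APP_KEY)

dbx = get_dbx()
dbx.users_get_current_account()
print("Client ready without re-authorizing!")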

Airflow how to write API request results to BigQuery table or GCS file

I am a novice trying to use Airflow to create a simple pipeline that gets data from an API and stores it in a BigQuery table. I have been successful in decoding the JSON returned by the API and printing it out. I am stuck trying to figure out how to store that data in a file in GCS (or somewhere) so that I can then use GCSToBigQueryOperator to load the data into BigQuery. I haven't found any operator that lets me create a file, populate it, and store it in a bucket.
Also, is this the right approach for this pipeline?
Thank you
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.operators.bash import BashOperator
from airflow.operators.email import EmailOperator
from datetime import datetime, timedelta
from urllib.request import urlopen
from urllib.request import Request
import json

default_args = {
    "owner": "airflow",
    "email_on_failure": False,
    "email_on_retry": False,
    "email": "admin@localhost.com",
    "retries": 1,
    "retry_delay": timedelta(minutes=5)
}

headers = {
    'Content-Type': 'application/json',
    'api-auth-accountid': '00000-0rererekl09rtlkjescgy0-0-ete90-et',
    'api-auth-applicationkey': 'rre9gr45jk34594gft-3it30it'
}

def download_sales():
    request1 = Request(
        'https://inventory.dvvvvvvtems.com/ExternalApi/v2/saleList?Limit=50&CreatedSince=2021-1-1', headers=headers)
    sale_list = urlopen(request1).read()
    dec_sale_list = json.loads(sale_list)
    for sale in dec_sale_list['SaleList']:
        print("sale ID: " + sale['SaleID'] + " Customer:" + sale['Customer'] + " Order Date: " + sale['OrderDate'])
        s_id = sale['SaleID']
        request2 = Request(f'https://inventory.dfakesysjhghtems.com/ExternalApi/t3/sale/order?SaleID={s_id}', headers=headers)
        order_list = urlopen(request2).read()
        dec_item_list = json.loads(order_list)
        for sku in dec_item_list['Lines']:
            print("SKU: " + sku['SKU'] + " Product: " + sku['Name'] + " Quantity: " + str(sku['Quantity']) + " Price: " + str(sku['Price']) + " Total: " + str(sku['Total']))

with DAG("sales_data_pipeline", start_date=datetime(2021, 1, 1),
         schedule_interval="@daily", default_args=default_args, catchup=False) as dag:

    downloading_sales = PythonOperator(
        task_id="downloading_sales",
        python_callable=download_sales
    )
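As for the approach: staging the API output as a file in GCS and then loading it with GCSToBigQueryOperator is a common pattern. There is no operator that writes an arbitrary file for you, but a PythonOperator can do it via GCSHook. A hedged sketch under assumed names (my-sales-bucket, my_project.sales_dataset.sales, and the default GCP connection are all placeholders):

import json
from datetime import datetime
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.providers.google.cloud.hooks.gcs import GCSHook
from airflow.providers.google.cloud.transfers.gcs_to_bigquery import GCSToBigQueryOperator

def upload_sales_to_gcs():
    # Shape of the rows is assumed; substitute the decoded API data here
    rows = [{"SaleID": "1", "Customer": "A", "OrderDate": "2021-01-01"}]
    ndjson = "\n".join(json.dumps(row) for row in rows)  # newline-delimited JSON
    GCSHook(gcp_conn_id="google_cloud_default").upload(
        bucket_name="my-sales-bucket",        # assumed bucket
        object_name="sales/sales.json",
        data=ndjson)

with DAG("sales_to_bq", start_date=datetime(2021, 1, 1),
         schedule_interval="@daily", catchup=False) as dag:
    upload_to_gcs = PythonOperator(
        task_id="upload_to_gcs",
        python_callable=upload_sales_to_gcs)
    load_to_bq = GCSToBigQueryOperator(
        task_id="load_to_bq",
        bucket="my-sales-bucket",
        source_objects=["sales/sales.json"],
        destination_project_dataset_table="my_project.sales_dataset.sales",  # assumed
        source_format="NEWLINE_DELIMITED_JSON",
        autodetect=True,
        write_disposition="WRITE_TRUNCATE")
    upload_to_gcs >> load_to_bq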

Invalid AWS Access Key when calling S3 from Apex

I am attempting to access s3 from Apex using credentials returned from AssumeRole. However, I am receiving the following error:
<Message>The AWS Access Key Id you provided does not exist in our records.</Message>
<AWSAccessKeyId>ASIA********</AWSAccessKeyId>
I am able to successfully call GetObject on this s3 bucket from the CLI using the credentials returned from AssumeRole, so I can be reasonably sure that my bucket permissions have been set up fine. I have the following code in Apex:
Http http = new Http();
Profile p = [SELECT Id FROM Profile WHERE Profile.Name = 'S3 Test User' LIMIT 1];
S3_Settings__c s3 = S3_Settings__c.getInstance(p.Id);
String exp = String.valueOf(Cache.Session.get('expiration'));
String sessionToken = String.valueOf(Cache.Session.get('token'));
if (exp == null || exp == '' || (DateTime) JSON.deserialize('"' + exp + '"', DateTime.class) < System.now()) {
    requestSessionToken();
}
sessionToken = String.valueOf(Cache.Session.get('token'));
DateTime expires = (DateTime) JSON.deserialize('"' + String.valueOf(Cache.Session.get('expiration')) + '"', DateTime.class);
String accessKeyId = String.valueOf(Cache.Session.get('accessKeyId'));
String accessSecret = String.valueOf(Cache.Session.get('secret'));
String bucketname = s3.Recording_Bucket__c;
String host = 's3.amazonaws.com';
String formattedDateString = Datetime.now().formatGMT('EEE, dd MMM yyyy HH:mm:ss z');
String method = 'GET';
String filePath = 'https://' + bucketname + '.' + host + '/' + filename;

HttpRequest req = new HttpRequest();
req.setMethod(method);
req.setEndpoint(filePath);
req.setHeader('Host', bucketname + '.' + host);
req.setHeader('Connection', 'keep-alive');

String stringToSign = 'GET\n\n' + 'x-amz-security-token=' + sessionToken + '&expiration=' + expires + '\n' + formattedDateString + '\n/' + '/' + bucketname + '/' + filename;
System.debug('SIGN ' + stringToSign);
String encodedStringToSign = EncodingUtil.urlEncode(stringToSign, 'UTF-8');
Blob mac = Crypto.generateMac('HMACSHA1', Blob.valueOf(stringToSign), Blob.valueOf(accessSecret));
String signedKey = EncodingUtil.base64Encode(mac);
String authHeader = 'AWS' + ' ' + accessKeyId + ':' + signedKey;

req.setHeader('Date', formattedDateString);
//req.setHeader('x-amz-security-token', sessionToken); //AWS returns 'invalid signature' if this is set
req.setHeader('Authorization', authHeader);
HttpResponse resp = http.send(req);
It seems as if AWS is reading the AccessKeyId/Secret but not the session token. I've also tried setting x-amz-security-token as a header, but that throws a 403 error (signature mismatch). Am I missing something in my headers or signature that would let this request return successfully?
Turns out I was placing the x-amz-security-token header in the wrong location. It needs to occur in the canonicalized AMZ headers section immediately after the formatted date, with a colon separating the name and value:
String stringToSign = 'GET\n\n\n' + formattedDateString + '\n' + 'x-amz-security-token:' + sessionToken + '\n' + '/' + bucketname + '/' + filename;
Additionally, the following line needed to be uncommented:
req.setHeader('x-amz-security-token', sessionToken);
As a last note, be sure that neither the header name nor the canonicalized AMZ header name is capitalized.
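For anyone who wants to sanity-check the string-to-sign outside Apex, here is a rough Python equivalent of the same Signature Version 2 construction (credentials are placeholders; note that SigV2 is legacy and SigV4 is the current scheme):

# Sketch of the corrected string-to-sign: the session token goes in the
# canonicalized AMZ headers section, lowercase, immediately after the date.
import base64, hmac, hashlib
from email.utils import formatdate

access_key_id = 'ASIA...'   # placeholder, from AssumeRole
secret = '...'              # placeholder
session_token = '...'       # placeholder
bucket, key = 'my-bucket', 'my-file.txt'

date = formatdate(usegmt=True)  # e.g. 'Tue, 01 Jan 2030 00:00:00 GMT'
string_to_sign = ('GET\n\n\n' + date + '\n'
                  + 'x-amz-security-token:' + session_token + '\n'
                  + '/' + bucket + '/' + key)
signature = base64.b64encode(
    hmac.new(secret.encode(), string_to_sign.encode(), hashlib.sha1).digest()).decode()

headers = {
    'Date': date,
    'x-amz-security-token': session_token,  # must also be sent as a real header
    'Authorization': 'AWS ' + access_key_id + ':' + signature,
}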

Katalon Studio - Cannot send the test request: Invalid start or end

I'm trying to send a POST web request to a specific address (for example: https://qa.alycedev.com/).
Any properties I set (header, body, cookies, URL encode/decode) lead to an error while sending my web request:
Warning > Cannot send the test request. Reason:
java.lang.IllegalArgumentException: invalid start or end.
Details of the error: https://i.stack.imgur.com/szGan.png
Example of my request:
@Keyword
def purchase_simple(def totalCookies) {
    WebUI.comment('Total number of cookies=' + totalCookies.size())
    RequestObject ro = new RequestObject('A')
    ro.setRestRequestMethod('POST')
    ro.setRestUrl('https://qa.alycedev.com/gifter/dashboard')
    def httpheader = new ArrayList<TestObjectProperty>()
    httpheader.add(new TestObjectProperty('Content-Type', ConditionType.EQUALS, 'application/json'))
    httpheader.add(new TestObjectProperty('Accept', ConditionType.EQUALS, 'application/json'))
    for (Cookie currentCookie : totalCookies) {
        httpheader.add(new TestObjectProperty('Cookie', ConditionType.EQUALS, 'domain=' + currentCookie.getDomain() + '; ' + currentCookie.getName() + '=' + currentCookie.getValue() + '; expires=' + currentCookie.getExpiry() + '; Max-Age=7200; path=/;'))
    }
    ro.setHttpHeaderProperties(httpheader)
    def body = '{"_token": "' + GlobalVariable.G_api_oauth.csrfToken + '","use_credits": ' + GlobalVariable.G_product.price + ',"use_payment": 0.00,"currency_id": 1,"total_price": ' + GlobalVariable.G_product.price + ',"payment_data" : null,"gift_send_data": "{"id":' + GlobalVariable.G_gift.id + ',"product_id":' + GlobalVariable.G_product.id + ',"total_price":' + GlobalVariable.G_product.price + ',"send_now":true,"schedule_at":"","send_type":"hwcard","gifter_company":"Apple","giftee_company":"Sony","from":null,"gifter_address":{"country":{"id":1,"name":"United States","code":"US","image":""},"country_id":1,"address":"Address","address2":"Address 2","city":"City","state":"State","zipcode":"123321"},"giftee_address":{"country":{"id":1,"name":"United States","code":"US","image":""},"country_id":1,"address":"Address","address2":"Address 2","city":"City","state":"State","zipcode":"123321"},"message_to_giftee":{"id":1,"default":1,"message":"Messages subject + 10 characters.","subject":"Something for your time","name":"Something for your time"},"prospect":{"capture_date":true,"capture_email":true,"capture_phone":true,"capture_question":false,"capture_affidavit":false,"gifter_affidavit":"","gifter_question":""},"delivery_method_data":{"type":"branded_box"}}"}'
    ro.setBodyContent(new HttpTextBodyContent(body, 'UTF-8', 'application/json'))
    WebUI.comment(ro.activeProperties.toArray().toString())
    try {
        def response = WSBuiltInKeywords.sendRequest(ro)
    }
    catch (Exception ex) {
        println(ex.detailMessage)
        println(ex.stackTraceDepth.toString())
    }
}

getting "415:Media not supported" Error when passing pdf to IBM watson in Salesforce

I am planning to integrate the IBM Watson Document Conversion service with Salesforce.
From there, I am unable to send my PDF file directly to Watson, and I'm getting a "media type not supported" error.
I am also getting this error:
{
  "code": 500,
  "error": "Server Error",
  "description": "2017-07-18T06:02:19-04:00, Error WATSNGWERR-0x0113001c occurred when accessing https://gateway.watsonplatform.net/document-conversion/api/v1/convert_document?version=2015-12-15&config="{"conversion_target":"answer_units"}", Tran-Id: gateway-dp02-1967135880 - Watson Gateway Error"
}
Here is the code I'm using:
public class Resume {
    String boundary = '----------------------------741e90d31eff';
    public String id {get; set;}
    public String content {get; set;}
    transient public Attachment att {set; get;}

    public Resume(ApexPages.StandardController controller) {
        id = ApexPages.currentPage().getParameters().get('id');
        att = new Attachment();
        att = [SELECT Id, ParentId, Name, Body, ContentType FROM Attachment WHERE ParentId = :id LIMIT 1];
        content = String.valueOf(att.Body);
        System.debug('---->' + content);

        String header = '--' + boundary + '\nContent-Disposition: form-data; name="att"; filename="' + att.Name + '";\nContent-Type: application/pdf';
        String footer = '--' + boundary + '--';
        String headerEncoded = EncodingUtil.base64Encode(Blob.valueOf(header + '\r\n\r\n'));
        String bodyEncoded = EncodingUtil.base64Encode(att.Body);
        Blob bodyBlob = null;
        String last4Bytes = bodyEncoded.substring(bodyEncoded.length() - 4, bodyEncoded.length());

        while (headerEncoded.endsWith('=')) {
            header += ' ';
            headerEncoded = EncodingUtil.base64Encode(Blob.valueOf(header + '\r\n\r\n'));
        }

        if (last4Bytes.endsWith('==')) {
            last4Bytes = last4Bytes.substring(0, 2) + '0K';
            bodyEncoded = bodyEncoded.substring(0, bodyEncoded.length() - 4) + last4Bytes;
            String footerEncoded = EncodingUtil.base64Encode(Blob.valueOf(footer));
            bodyBlob = EncodingUtil.base64Decode(headerEncoded + bodyEncoded + footerEncoded);
        } else if (last4Bytes.endsWith('=')) {
            last4Bytes = last4Bytes.substring(0, 3) + 'N';
            bodyEncoded = bodyEncoded.substring(0, bodyEncoded.length() - 4) + last4Bytes;
            footer = '\n' + footer;
            String footerEncoded = EncodingUtil.base64Encode(Blob.valueOf(footer));
            bodyBlob = EncodingUtil.base64Decode(headerEncoded + bodyEncoded + footerEncoded);
        } else {
            footer = '\r\n' + footer;
            String footerEncoded = EncodingUtil.base64Encode(Blob.valueOf(footer));
            bodyBlob = EncodingUtil.base64Decode(headerEncoded + bodyEncoded + footerEncoded);
        }

        String configAsString = '\"conversion_target:answer_units\"';
        Http h = new Http();
        HttpRequest request = new HttpRequest();
        request.setMethod('POST');
        request.setHeader('Content-Type', 'multipart/form-data; boundary=' + boundary);
        String username = 'DOCUMENT-CONVERSION-USERNAME';
        String password = 'DOCUMENT-CONVERSION-PASSWORD';
        request.setHeader('Authorization', 'Basic ' + EncodingUtil.base64Encode(Blob.valueOf(username + ':' + password)));
        request.setEndpoint('https://gateway.watsonplatform.net/document-conversion/api/v1/convert_document?version=2015-12-15&config=' + configAsString);
        request.setBodyAsBlob(bodyBlob);
        request.setCompressed(true);
        HttpResponse response = h.send(request);
        System.debug(response.getBody());
    }
}
You are sending the config as a query parameter, but it should be in the body.
Here is the curl command that does what you are trying to do:
curl -X POST \
  -u "{username}":"{password}" \
  -F config="{\"conversion_target\":\"answer_units\"}" \
  -F "file=@sample.pdf;type=application/pdf" \
  "https://gateway.watsonplatform.net/document-conversion/api/v1/convert_document?version=2015-12-15"
I also think there is an error in the way you are creating the body. My team built an SDK for using the Watson APIs in the Salesforce environment; I would suggest you take a look.
If you can't deploy the SDK to your Salesforce organization (it's a lot of code), copy the code from the IBMWatsonMultipartBody.cls class. It will help you encode an Attachment as base64 so that you can send it to Watson.
UPDATE: The Document Conversion service has been deprecated, but its features were enhanced and migrated to the Discovery service.