Why does Azure Data Factory seemingly insist on inserting DateTimes as string?

I'm trying to set up an Azure Data Factory to copy and denormalize my data from an Azure SQL database to another Azure SQL database for reporting/BI purposes with a data flow, but I ran into a problem with inserting dates.
This is the definition of my dataflow.
{
"name": "dataflow1",
"properties": {
"type": "MappingDataFlow",
"typeProperties": {
"sources": [
{
"dataset": {
"referenceName": "AzureSqlTable1",
"type": "DatasetReference"
},
"name": "source1"
}
],
"sinks": [
{
"dataset": {
"referenceName": "AzureSqlTable2",
"type": "DatasetReference"
},
"name": "sink1"
}
],
"script": "\n\nsource(output(\n\t\tBucketId as string,\n\t\tStreamId as string,\n\t\tStreamIdOriginal as string,\n\t\tStreamRevision as integer,\n\t\tItems as integer,\n\t\tCommitId as string,\n\t\tCommitSequence as integer,\n\t\tCommitStamp as timestamp,\n\t\tCheckpointNumber as long,\n\t\tDispatched as boolean,\n\t\tHeaders as binary,\n\t\tPayload as binary\n\t),\n\tallowSchemaDrift: true,\n\tvalidateSchema: false,\n\tisolationLevel: 'READ_UNCOMMITTED',\n\tformat: 'table') ~> source1\nsource1 sink(allowSchemaDrift: true,\n\tvalidateSchema: false,\n\tformat: 'table',\n\tdeletable:false,\n\tinsertable:true,\n\tupdateable:false,\n\tupsertable:false,\n\tmapColumn(\n\t\tBucketId,\n\t\tCommitStamp\n\t)) ~> sink1"
}
}
}
and these are the definitions of my source
{
"name": "AzureSqlTable1",
"properties": {
"linkedServiceName": {
"referenceName": "Source_Test",
"type": "LinkedServiceReference"
},
"annotations": [],
"type": "AzureSqlTable",
"schema": [
{
"name": "BucketId",
"type": "varchar"
},
{
"name": "StreamId",
"type": "char"
},
{
"name": "StreamIdOriginal",
"type": "nvarchar"
},
{
"name": "StreamRevision",
"type": "int",
"precision": 10
},
{
"name": "Items",
"type": "tinyint",
"precision": 3
},
{
"name": "CommitId",
"type": "uniqueidentifier"
},
{
"name": "CommitSequence",
"type": "int",
"precision": 10
},
{
"name": "CommitStamp",
"type": "datetime2",
"scale": 7
},
{
"name": "CheckpointNumber",
"type": "bigint",
"precision": 19
},
{
"name": "Dispatched",
"type": "bit"
},
{
"name": "Headers",
"type": "varbinary"
},
{
"name": "Payload",
"type": "varbinary"
}
],
"typeProperties": {
"tableName": "[dbo].[Commits]"
}
}
}
and sink data sets
{
"name": "AzureSqlTable2",
"properties": {
"linkedServiceName": {
"referenceName": "Dest_Test",
"type": "LinkedServiceReference"
},
"annotations": [],
"type": "AzureSqlTable",
"schema": [],
"typeProperties": {
"tableName": "dbo.Test2"
}
}
}
When running my pipeline with the data flow I get the following error:
Activity dataflow1 failed: DF-EXEC-1 Conversion failed when converting date and/or time from character string.
com.microsoft.sqlserver.jdbc.SQLServerException: Conversion failed when converting date and/or time from character string.
at com.microsoft.sqlserver.jdbc.SQLServerException.makeFromDatabaseError(SQLServerException.java:258)
at com.microsoft.sqlserver.jdbc.TDSTokenHandler.onEOF(tdsparser.java:256)
at com.microsoft.sqlserver.jdbc.TDSParser.parse(tdsparser.java:108)
at com.microsoft.sqlserver.jdbc.TDSParser.parse(tdsparser.java:28)
at com.microsoft.sqlserver.jdbc.SQLServerBulkCopy.doInsertBulk(SQLServerBulkCopy.java:1611)
at com.microsoft.sqlserver.jdbc.SQLServerBulkCopy.access$200(SQLServerBulkCopy.java:58)
at com.microsoft.sqlserver.jdbc.SQLServerBulkCopy$1InsertBulk.doExecute(SQLServerBulkCopy.java:709)
at com.microsoft.sqlserver.jdbc.TDSCommand.execute(IOBuffer.java:7151)
at com.microsoft.sqlserver.jdbc.SQLServerConnection.executeCommand(SQLServerConnection.java:2478)
at com.microsoft.sqlserver.jdbc.SQLServerBulkCopy.sendBulkLoadBCP(SQLServerBulkCopy.java:739)
at com.microsoft.sqlserver.jdbc.SQLServerBulkCopy.writeToServer(SQLServerBulkCopy.java:1684)
at com.microsoft.sqlserver.jdbc.SQLServerBulkCopy.writeToServer(SQLServerBulkCopy.java:669)
at com.microsoft.azure.sqldb.spark.connect.DataFrameFunctions.com$microsoft$azure$sqldb$spark$connect$DataFrameFunctions$$bulkCopy(DataFrameFunctions.scala:127)
at com.microsoft.azure.sqldb.spark.connect.DataFrameFunctions$$anonfun$bulkCopyToSqlDB$1.apply(DataFrameFunctions.scala:72)
at com.microsoft.azure.sqldb.spark.connect.DataFrameFunctions$$anonfun$bulkCopyToSqlDB$1.apply(DataFrameFunctions.scala:72)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:948)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:948)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2226)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2226)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:124)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$11.apply(Executor.scala:459)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1401)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:465)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
My Azure SQL audit log shows the following statement that failed (which is not a huge surprise, considering that it uses VARCHAR(50) as the type for [CommitStamp]):
INSERT BULK dbo.T_301fcb5e4a4148d4a48f2943011b2f04 (
[BucketId] NVARCHAR(MAX),
[CommitStamp] VARCHAR(50),
[StreamId] NVARCHAR(MAX),
[StreamIdOriginal] NVARCHAR(MAX),
[StreamRevision] INT,
[Items] INT,
[CommitId] NVARCHAR(MAX),
[CommitSequence] INT,
[CheckpointNumber] BIGINT,
[Dispatched] BIT,
[Headers] VARBINARY(MAX),
[Payload] VARBINARY(MAX),
[r8e440f7252bb401b9ead107597de6293] INT)
with (ROWS_PER_BATCH = 4096, TABLOCK)
I have absolutely no idea why this occurs. The schema information looks correct, but somehow the data factory/data flow wants to insert CommitStamp as a string type.
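For what it's worth, since the sink dataset above declares no schema, a quick query like the following (a generic sanity check, not ADF-specific) can confirm which column types dbo.Test2 actually exposes:
-- Hypothetical sanity check on the sink database: list the column types of dbo.Test2
-- (the sink dataset JSON above has an empty "schema" array).
SELECT COLUMN_NAME, DATA_TYPE, DATETIME_PRECISION
FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_SCHEMA = 'dbo'
  AND TABLE_NAME = 'Test2';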
As requested, the output from the data flow/code/plan view:
source(output(
BucketId as string,
StreamId as string,
StreamIdOriginal as string,
StreamRevision as integer,
Items as integer,
CommitId as string,
CommitSequence as integer,
CommitStamp as timestamp,
CheckpointNumber as long,
Dispatched as boolean,
Headers as binary,
Payload as binary
),
allowSchemaDrift: true,
validateSchema: false,
isolationLevel: 'READ_UNCOMMITTED',
format: 'table',
schemaName: '[dbo]',
tableName: '[Commits]',
store: 'sqlserver',
server: 'sign2025-sqldata.database.windows.net',
database: 'SignPath.Application',
user: 'Sign2025Admin',
password: '**********') ~> source1
source1 sink(allowSchemaDrift: true,
validateSchema: false,
format: 'table',
deletable:false,
insertable:true,
updateable:false,
upsertable:false,
mapColumn(
BucketId,
CommitStamp
),
schemaName: 'dbo',
tableName: 'Test2',
store: 'sqlserver',
server: 'sign2025-sqldata.database.windows.net',
database: 'SignPath.Reporting',
user: 'Sign2025Admin',
password: '**********') ~> sink1

I created a data flow to copy data from an Azure SQL database to another Azure SQL database. It succeeded in converting datetime2 to VARCHAR(50).
This is the definition of my dataflow:
{
"name": "dataflow1",
"properties": {
"type": "MappingDataFlow",
"typeProperties": {
"sources": [
{
"dataset": {
"referenceName": "DestinationDataset_sto",
"type": "DatasetReference"
},
"name": "source1"
}
],
"sinks": [
{
"dataset": {
"referenceName": "DestinationDataset_mex",
"type": "DatasetReference"
},
"name": "sink1"
}
],
"script": "\n\nsource(output(\n\t\tID as integer,\n\t\ttName as string,\n\t\tmyTime as timestamp\n\t),\n\tallowSchemaDrift: true,\n\tvalidateSchema: false,\n\tisolationLevel: 'READ_UNCOMMITTED',\n\tformat: 'table') ~> source1\nsource1 sink(input(\n\t\tID as integer,\n\t\ttName as string,\n\t\tmyTime as string\n\t),\n\tallowSchemaDrift: true,\n\tvalidateSchema: false,\n\tformat: 'table',\n\tdeletable:false,\n\tinsertable:true,\n\tupdateable:false,\n\tupsertable:false) ~> sink1"
}
}
}
The definitions of my source:
{
"name": "DestinationDataset_sto",
"properties": {
"linkedServiceName": {
"referenceName": "AzureSqlDatabase1",
"type": "LinkedServiceReference"
},
"annotations": [],
"type": "AzureSqlTable",
"schema": [
{
"name": "ID",
"type": "int",
"precision": 10
},
{
"name": "tName",
"type": "varchar"
},
{
"name": "myTime",
"type": "datetime2",
"scale": 7
}
],
"typeProperties": {
"tableName": "[dbo].[demo]"
}
},
"type": "Microsoft.DataFactory/factories/datasets"
}
My sink settings:
{
"name": "DestinationDataset_mex",
"properties": {
"linkedServiceName": {
"referenceName": "AzureSqlDatabase1",
"type": "LinkedServiceReference"
},
"annotations": [],
"type": "AzureSqlTable",
"schema": [
{
"name": "ID",
"type": "int",
"precision": 10
},
{
"name": "tName",
"type": "varchar"
},
{
"name": "myTime",
"type": "varchar"
}
],
"typeProperties": {
"tableName": "[dbo].[demo1]"
}
},
"type": "Microsoft.DataFactory/factories/datasets"
}
Here are my data flow steps.
Step 1: Source settings:
Step 2: Sink settings:
Running succeeded:
The tables demo and demo1 have almost the same schema, except for myTime (see the sketch below).
My source table and its data:
My sink table and the data copied from demo:
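For reference, a sketch of the two table definitions as implied by the dataset JSON above (the varchar lengths are assumptions on my part):
-- Source table: column types taken from the DestinationDataset_sto schema above
CREATE TABLE dbo.demo (
    ID     int,
    tName  varchar(50),    -- length not given in the dataset JSON; assumed
    myTime datetime2(7)
);
-- Sink table: same shape, but myTime is stored as a string
CREATE TABLE dbo.demo1 (
    ID     int,
    tName  varchar(50),    -- assumed
    myTime varchar(50)     -- the dataset only says varchar; length assumed
);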
Data Flow plan:
source(output(
ID as integer,
tName as string,
myTime as timestamp
),
allowSchemaDrift: true,
validateSchema: true,
isolationLevel: 'SERIALIZABLE',
format: 'table',
schemaName: '[dbo]',
tableName: '[demo]',
store: 'sqlserver',
server: '****.database.windows.net',
database: '****',
user: 'ServerAdmin',
password: '**********') ~> source1
source1 sink(input(
ID as integer,
tName as string,
myTime as string
),
allowSchemaDrift: true,
validateSchema: false,
format: 'table',
deletable:false,
insertable:true,
updateable:false,
upsertable:false,
schemaName: '[dbo]',
tableName: '[demo1]',
store: 'sqlserver',
server: '****.database.windows.net',
database: '****',
user: 'ServerAdmin',
password: '**********') ~> sink1
Update1:
I created the sink table manually and found that:
Data Flow can convert datetime2 to VARCHAR() (maybe NVARCHAR()), date, and datetimeoffset.
When I try the date/time types time, datetime, datetime2, or smalldatetime as the sink type, Data Flow always gives the error (see the sketch below):
"message": "DF-EXEC-1 Conversion failed when converting date and/or time from character string."
Update 2019-7-11:
I asked Azure Support for help and they replied: this is a bug in Data Flow and there is no solution for now.
Update 2019-7-12:
I tested with Azure Support and they confirmed this is a bug. Here is the new email:
They also told me that the fix has already been made and will be deployed in the next deployment train. This could be by the end of next week.
Hope this helps.

Looks like your Sink dataset defines myTime as a String:
sink(input(
ID as integer,
tName as string,
myTime as string
)
Can you change that to timestamp or Date, whichever you'd like to land it as?
Alternatively, you can land the data in a temporary staging table in SQL by setting "Recreate table" on the Sink and let ADF generate a new table definition on the fly using the data types of your mapped fields in the data flow.
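Roughly speaking, with the second data flow's mapping (ID integer, tName string, myTime timestamp), a table recreated from the mapped types would end up with a date/time column rather than a string one. A sketch of the kind of definition that implies (the exact SQL types ADF chooses are not shown in this thread and are assumed here):
-- Sketch only: a sink shaped by the mapped data flow types instead of varchar
CREATE TABLE dbo.demo1_staging (
    ID     int,
    tName  nvarchar(max),   -- assumed; ADF's string mapping is not documented here
    myTime datetime2(7)     -- assumed; the point is that it lands as a date/time type
);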

Related

REST dataset for Copy Activity Source gives me error Invalid PaginationRule

My Copy Activity is set up to use a REST GET API call as my source. I keep getting Error Code 2200 Invalid PaginationRule RuleKey=supportRFC5988.
I can call the GET REST URL using the Web Activity, but this isn't optimal as I then have to pass the output to a stored procedure to load the data into the table. I would much rather use the Copy Activity.
Any ideas why I would get an Invalid PaginationRule error on a call?
I'm using a REST Linked Service with the following properties:
Name: Workday
Connect via integration runtime: link-unknown-self-hosted-ir
Base URL: https://wd2-impl-services1.workday.com/ccx/service
Authentication type: Basic
User name: Not telling
Azure Key Vault for password
Server Certificate Validation is enabled
Parameters: Name:format Type:String Default value:json
Datasource:
"name": "Workday_Test_REST_Report",
"properties": {
"linkedServiceName": {
"referenceName": "Workday",
"type": "LinkedServiceReference",
"parameters": {
"format": "json"
}
},
"folder": {
"name": "Workday"
},
"annotations": [],
"type": "RestResource",
"typeProperties": {
"relativeUrl": "/customreport2/company1/person%40company.com/HIDDEN_BI_RaaS_Test_Outbound"
},
"schema": []
}
}
Copy Activity
{
"name": "Copy Test Workday REST API output to a table",
"properties": {
"activities": [
{
"name": "Copy data1",
"type": "Copy",
"dependsOn": [],
"policy": {
"timeout": "7.00:00:00",
"retry": 0,
"retryIntervalInSeconds": 30,
"secureOutput": false,
"secureInput": false
},
"userProperties": [],
"typeProperties": {
"source": {
"type": "RestSource",
"httpRequestTimeout": "00:01:40",
"requestInterval": "00.00:00:00.010",
"requestMethod": "GET",
"paginationRules": {
"supportRFC5988": "true"
}
},
"sink": {
"type": "SqlMISink",
"tableOption": "autoCreate"
},
"enableStaging": false
},
"inputs": [
{
"referenceName": "Workday_Test_REST_Report",
"type": "DatasetReference"
}
],
"outputs": [
{
"referenceName": "Destination_db",
"type": "DatasetReference",
"parameters": {
"schema": "ELT",
"tableName": "WorkdayTestReportData"
}
}
]
}
],
"folder": {
"name": "Workday"
},
"annotations": []
}
}
Well, after posting this, I noticed that in the Copy Activity code there is a nugget about "supportRFC5988": "true". I switched true to false, and everything just worked for me. I don't see a way to change this in the Copy Activity GUI.
Editing the source code and setting this option to false helped!

Extract array from varchar in PrestoSQL

I have a VARCHAR field like this:
[
{
"config": 0,
"type": "0
},
{
"config": x,
"type": "1"
},
{
"config": "",
"type": ""
},
{
"config": [
{
"address": {},
"category": "",
"merchant": {
"data": [
10,12,23
],
"file": 0
},
"range_id": 1,
"shop_id_info": null
}
],
"type": "new"
}
]
And I need to extract the merchant data from this. The desired output is:
10
12
23
Please advise. I keep getting: Cannot cast VARCHAR to array/unnest type VARCHAR
You can try using the JSON path $.*.config.*.merchant.data.*, but if it does not work for you (as for me on the Athena version, where arrays in JSON paths are not supported well), you can cast your JSON to ARRAY(JSON) and do some manipulations from there (I needed to fix your JSON a little bit):
Test data:
WITH dataset AS (
SELECT * FROM (VALUES
(JSON '[
{
"config": {},
"type": "0"
},
{
"config": "x",
"type": "1"
},
{
"config": "",
"type": ""
},
{
"config": [
{
"address": {},
"category": "",
"merchant": {
"data": [
10,12,23
],
"file": 0
},
"range_id": 1,
"shop_id_info": null
}
],
"type": "new"
}
]')
) AS t (json_value))
And query:
SELECT flatten(
transform(
flatten(
transform(
CAST(json_value AS ARRAY(JSON))
, json_object -> try(CAST(json_extract(json_object, '$.config') AS ARRAY(JSON))))),
json_config -> CAST(json_extract(json_config, '$.merchant.data') as ARRAY(INTEGER))))
FROM dataset
Which will give you an array of numbers:
_col0
[10, 12, 23]
And from there you can continue with unnest and so on if needed.
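For example, a sketch of that unnest step, reusing the same WITH dataset test data from above (the alias merchant_value is mine):
SELECT merchant_value
FROM dataset
CROSS JOIN UNNEST(
    flatten(
        transform(
            flatten(
                transform(
                    CAST(json_value AS ARRAY(JSON)),
                    json_object -> try(CAST(json_extract(json_object, '$.config') AS ARRAY(JSON))))),
            json_config -> CAST(json_extract(json_config, '$.merchant.data') AS ARRAY(INTEGER))))
) AS t (merchant_value)
This should return 10, 12 and 23 as separate rows.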

Query Druid SQL inner join with a dataSource name that has a dash

How do I write an INNER JOIN query between two data sources when one of them has a dash in its name?
Executing the following query on the Druid SQL binary results in a query error
SELECT *
FROM first
INNER JOIN "second-schema" on first.device_id = "second-schema".device_id;
org.apache.druid.java.util.common.ISE: Cannot build plan for query
Is this the correct syntax when trying to reference a data source that has a dash in its name?
Schema
[
{
"dataSchema": {
"dataSource": "second-schema",
"parser": {
"type": "string",
"parseSpec": {
"format": "json",
"timestampSpec": {
"column": "ts_start"
},
"dimensionsSpec": {
"dimensions": [
"etid",
"device_id",
"device_name",
"x_1",
"x_2",
"x_3",
"vlan",
"s_x",
"d_x",
"d_p",
"msg_type"
],
"dimensionExclusions": [],
"spatialDimensions": []
}
}
},
"metricsSpec": [
{ "type": "hyperUnique", "name": "conn_id_hll", "fieldName": "conn_id"},
{
"type": "count",
"name": "event_count"
}
],
"granularitySpec": {
"type": "uniform",
"segmentGranularity": "HOUR",
"queryGranularity": "minute"
}
},
"ioConfig": {
"type": "realtime",
"firehose": {
"type": "kafka-0.8",
"consumerProps": {
"zookeeper.connect": "localhost:2181",
"zookeeper.connectiontimeout.ms": "15000",
"zookeeper.sessiontimeout.ms": "15000",
"zookeeper.synctime.ms": "5000",
"group.id": "flow-info",
"fetch.size": "1048586",
"autooffset.reset": "largest",
"autocommit.enable": "false"
},
"feed": "flow-info"
},
"plumber": {
"type": "realtime"
}
},
"tuningConfig": {
"type": "realtime",
"maxRowsInMemory": 50000,
"basePersistDirectory": "\/opt\/druid-data\/realtime\/basePersist",
"intermediatePersistPeriod": "PT10m",
"windowPeriod": "PT15m",
"rejectionPolicy": {
"type": "serverTime"
}
}
},
{
"dataSchema": {
"dataSource": "first",
"parser": {
"type": "string",
"parseSpec": {
"format": "json",
"timestampSpec": {
"column": "ts_start"
},
"dimensionsSpec": {
"dimensions": [
"etid",
"category",
"device_id",
"device_name",
"severity",
"x_2",
"x_3",
"x_4",
"x_5",
"vlan",
"s_x",
"d_x",
"s_i",
"d_i",
"d_p",
"id"
],
"dimensionExclusions": [],
"spatialDimensions": []
}
}
},
"metricsSpec": [
{ "type": "doubleSum", "name": "val_num", "fieldName": "val_num" },
{ "type": "doubleMin", "name": "val_num_min", "fieldName": "val_num" },
{ "type": "doubleMax", "name": "val_num_max", "fieldName": "val_num" },
{ "type": "doubleSum", "name": "size", "fieldName": "size" },
{ "type": "doubleMin", "name": "size_min", "fieldName": "size" },
{ "type": "doubleMax", "name": "size_max", "fieldName": "size" },
{ "type": "count", "name": "first_count" }
],
"granularitySpec": {
"type": "uniform",
"segmentGranularity": "HOUR",
"queryGranularity": "minute"
}
},
"ioConfig": {
"type": "realtime",
"firehose": {
"type": "kafka-0.8",
"consumerProps": {
"zookeeper.connect": "localhost:2181",
"zookeeper.connectiontimeout.ms": "15000",
"zookeeper.sessiontimeout.ms": "15000",
"zookeeper.synctime.ms": "5000",
"group.id": "first",
"fetch.size": "1048586",
"autooffset.reset": "largest",
"autocommit.enable": "false"
},
"feed": "first"
},
"plumber": {
"type": "realtime"
}
},
"tuningConfig": {
"type": "realtime",
"maxRowsInMemory": 50000,
"basePersistDirectory": "\/opt\/druid-data\/realtime\/basePersist",
"intermediatePersistPeriod": "PT10m",
"windowPeriod": "PT15m",
"rejectionPolicy": {
"type": "serverTime"
}
}
}
]
Based on your schema definitions, there are a few observations I'll make.
When doing a join, you usually have to list out the columns explicitly (not use a *), otherwise you get collisions from duplicate columns. In your join, for example, you have a device_id in both "first" and "second-schema", not to mention all the other columns that are the same across both.
When using delimited (quoted) identifiers, I don't mix styles: either I quote every identifier or none of them.
So I think your query will work better in a form more like this:
SELECT
"first"."etid",
"first"."category",
"first"."device_id",
"first"."device_name",
"first"."severity",
"first"."x_2",
"first"."x_3",
"first"."x_4",
"first"."x_5",
"first"."vlan",
"first"."s_x",
"first"."d_x",
"first"."s_i",
"first"."d_i",
"first"."d_p",
"first"."id",
"second-schema"."etid" as "ss_etid",
"second-schema"."device_id" as "ss_device_id",
"second-schema"."device_name" as "ss_device_name",
"second-schema"."x_1" as "ss_x_1",
"second-schema"."x_2" as "ss_x_2",
"second-schema"."x_3" as "ss_x_3",
"second-schema"."vlan" as "ss_vlan",
"second-schema"."s_x" as "ss_s_x",
"second-schema"."d_x" as "ss_d_x",
"second-schema"."d_p" as "ss_d_p",
"second-schema"."msg_type"
FROM "first"
INNER JOIN "second-schema" ON "first"."device_id" = "second-schema"."device_id";
Obviously, feel free to name columns as you see fit, or include/exclude columns as needed. SELECT * will only work when all column names across both tables are unique.

Copy Activity properties to update Azure DW data from an on-prem SQL stored procedure in Data Factory

I'm not sure that what I'm trying to achieve is even possible in Data Factory, but I guess there should be a way.
Simply put, I have a table in the DW that needs to be updated by a stored procedure once a day.
This stored procedure resides on the source DB; I am looking for a way to pass some IDs to that SP, get the results, and store them in the DW.
Any help would be appreciated. The pipeline below is all I could think of:
{
"name": "UpdateColumnX",
"properties": {
"activities": [
{
"type": "SqlServerStoredProcedure?? Not Really Sure",
"typeProperties": {
"source": {
"type": "SqlSource",
"sqlReaderQuery": "$$Text.Format('Passing IDs to the stored Procedure', Time.AddHours(WindowStart,10), Time.AddHours(WindowEnd,10))\n"
},
"storedProcedureName": "UpdateDataThroughSP",
"storedProcedureParameters": {
"StartDate": "$$Text.Format('{0:yyyy-MM-dd HH:mm:ss}', Time.AddHours(WindowStart,10))",
"EndDate ": "$$Text.Format('{0:yyyy-MM-dd HH:mm:ss}', Time.AddHours(WindowEnd,10))"
}
},
"inputs": [
{
"name": "Not Sure which table should be my Input, the DW table having the IDs or the source table? "
}
],
"outputs": [
{
"name": "Sames and Input not sure"
}
],
"policy": {
"timeout": "01:00:00",
"concurrency": 1,
"retry": 3
},
"scheduler": {
"frequency": "Day",
"interval": 1,
"offset": "20:30:00"
},
"name": "Update Data through Source SP"
}
],
"start": "2017-09-13T20:30:00.045Z",
"end": "2099-12-30T13:00:00Z",
"isPaused": false,
"hubName": "HubName",
"pipelineMode": "Scheduled"
}
}

AWS Data Pipeline parameter error

I have created a pipeline to load data from S3 to an RDS MySQL instance. I can save the pipeline without any errors, but on activation I get the error "No value specified for parameter 1". My online search so far has suggested that the insert statement parameters need to be defined somewhere. If this is correct, how do I do so?
The following is the script generated in the process:
{
"objects": [
{
"output": {
"ref": "DestinationRDSTable"
},
"input": {
"ref": "S3InputDataLocation"
},
"dependsOn": {
"ref": "RdsMySqlTableCreateActivity"
},
"name": "DataLoadActivity",
"id": "DataLoadActivity",
"runsOn": {
"ref": "Ec2Instance"
},
"type": "CopyActivity"
},
{
"*password": "#{*myRDSPassword}",
"name": "rds_mysql",
"jdbcProperties": "allowMultiQueries=true",
"id": "rds_mysql",
"type": "RdsDatabase",
"rdsInstanceId": "#{myRDSInstanceId}",
"username": "#{myRDSUsername}"
},
{
"instanceType": "t1.micro",
"name": "Ec2Instance",
"actionOnTaskFailure": "terminate",
"securityGroups": "#{myEc2RdsSecurityGrps}",
"id": "Ec2Instance",
"type": "Ec2Resource",
"terminateAfter": "2 Hours"
},
{
"database": {
"ref": "rds_mysql"
},
"name": "RdsMySqlTableCreateActivity",
"runsOn": {
"ref": "Ec2Instance"
},
"id": "RdsMySqlTableCreateActivity",
"type": "SqlActivity",
"script": "#{myRDSTableInsertSql}"
},
{
"database": {
"ref": "rds_mysql"
},
"name": "DestinationRDSTable",
"insertQuery": "#{myRDSTableInsertSql}",
"id": "DestinationRDSTable",
"type": "SqlDataNode",
"table": "#{myRDSTableName}",
"selectQuery": "select * from #{table}"
},
{
"escapeChar": "\\",
"name": "DataFormat1",
"columnSeparator": "|",
"id": "DataFormat1",
"type": "TSV",
"recordSeparator": "\\n"
},
{
"directoryPath": "#{myInputS3Loc}",
"dataFormat": {
"ref": "DataFormat1"
},
"name": "S3InputDataLocation",
"id": "S3InputDataLocation",
"type": "S3DataNode"
},
{
"failureAndRerunMode": "CASCADE",
"resourceRole": "DataPipelineDefaultResourceRole",
"role": "DataPipelineDefaultRole",
"pipelineLogUri": "s3://logs3tords/",
"scheduleType": "ONDEMAND",
"name": "Default",
"id": "Default"
}
],
"parameters": [
{
"description": "RDS MySQL password",
"id": "*myRDSPassword",
"type": "String"
},
{
"watermark": "security group name",
"helpText": "The names of one or more EC2 security groups that have access to the RDS MySQL cluster.",
"description": "RDS MySQL security group(s)",
"isArray": "true",
"optional": "true",
"id": "myEc2RdsSecurityGrps",
"type": "String"
},
{
"description": "RDS MySQL username",
"id": "myRDSUsername",
"type": "String"
},
{
"description": "Input S3 file path",
"id": "myInputS3Loc",
"type": "AWS::S3::ObjectKey"
},
{
"helpText": "The SQL statement to insert data into the RDS MySQL table.",
"watermark": "INSERT INTO #{table} (col1, col2, col3) VALUES(?, ?, ?) ;",
"description": "Insert SQL query",
"id": "myRDSTableInsertSql",
"type": "String"
},
{
"helpText": "The name of an existing table or a new table that will be created based on the create table SQL query parameter below.",
"description": "RDS MySQL table name",
"id": "myRDSTableName",
"type": "String"
},
{
"watermark": "CREATE TABLE pet IF NOT EXISTS (name VARCHAR(20), owner VARCHAR(20), species VARCHAR(20), gender CHAR(1), birth DATE, death DATE);",
"helpText": "The idempotent SQL statement to create the RDS MySQL table if it does not already exist.",
"description": "Create table SQL query",
"optional": "true",
"id": "myRDSCreateTableSql",
"type": "String"
},
{
"watermark": "DB Instance",
"description": "RDS Instance ID",
"id": "myRDSInstanceId",
"type": "String"
}
],
"values": {
"myRDSInstanceId": "instance name",
"myRDSUsername": "user",
"myRDSTableInsertSql": "Insert into Ten.MD_ip_hp (ID, NAME, ADDRESS1, ADDRESS2, CITY, STATE, ZIP, DS ) VALUES(?,?,?,?,?,?,?,?);",
"*myRDSPassword": "password",
"myInputS3Loc": "log location",
"myRDSTableName": "MD_ip_hp"
}
}
UPDATE:
So I specified 'script argument' 1 to 8 on the SqlActivity node, which changed my error to "No value specified for parameter 2". How do I now get each number read as a different parameter? >:x
Such a silly thing!
I was able to resolve it by creating a separate script argument corresponding to each parameter in my query. In layman's terms, a script argument for each of the ? placeholders in my query (sketched below).
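For reference, this is the insert query from the pipeline values above; it contains eight positional placeholders, so the SqlActivity needs eight script arguments, one per ?, supplied in the same order as the columns in the TSV input (the binding itself is configured on the activity, not in the SQL):
-- Insert query referenced by #{myRDSTableInsertSql}; each ? is filled from
-- its own script argument on the SqlActivity, in order.
INSERT INTO Ten.MD_ip_hp (ID, NAME, ADDRESS1, ADDRESS2, CITY, STATE, ZIP, DS)
VALUES (?, ?, ?, ?, ?, ?, ?, ?);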