Here are the details of a very simple performance test. I'm trying to understand why running data flows in the cloud-native Azure Data Factory environment (Spark) is so much slower than running data flows hosted in the Azure SSIS IR. My results show that the latest ADFv2 is more than 4 times slower than the exact same data flow in Azure SSIS (even with an IR cluster that is still warm from the previous run). I like all the new features of the v2 data flows, but they hardly seem worth the performance hit unless I'm completely missing something. Eventually I'll be adding more complex data flows, but I wanted to understand the baseline performance behavior first.
Source:
1GB CSV stored in blob storage
Destination:
Azure SQL Server Database (one table and truncated before each run)
Control flow in ADFv2 using a simple Copy activity (no data flow)
91 seconds
Native SSIS package with a data flow (using the Azure Feature Pack to pull from the same blob storage), running on Azure SSIS with 8 cores
76 seconds
Pure ADF cloud pipeline using a Data Flow on a warm Azure IR (cached from the previous run) with 8 cores (+ 8 driver cores) and default partitioning (Spark)
(includes 96 seconds of cluster startup, which is another thing I don't understand, since the TTL on the IR is 30 minutes and it had just been run 10 minutes prior)
360 seconds
Pipeline (LandWithCopy)
{
"name": "LandWithCopy",
"properties": {
"activities": [
{
"name": "CopyData",
"type": "Copy",
"dependsOn": [],
"policy": {
"timeout": "7.00:00:00",
"retry": 0,
"retryIntervalInSeconds": 30,
"secureOutput": false,
"secureInput": false
},
"userProperties": [],
"typeProperties": {
"source": {
"type": "DelimitedTextSource",
"storeSettings": {
"type": "AzureBlobStorageReadSettings",
"recursive": true,
"wildcardFileName": "data.csv",
"enablePartitionDiscovery": false
},
"formatSettings": {
"type": "DelimitedTextReadSettings"
}
},
"sink": {
"type": "AzureSqlSink",
"preCopyScript": "TRUNCATE TABLE PatientAR",
"disableMetricsCollection": false
},
"enableStaging": false,
"translator": {
"type": "TabularTranslator",
"mappings": [
{
"source": {
"name": "RecordAction",
"type": "String"
},
"sink": {
"name": "RecordAction",
"type": "String"
}
},
{
"source": {
"name": "UniqueId",
"type": "String"
},
"sink": {
"name": "UniqueId",
"type": "String"
}
},
{
"source": {
"name": "Type",
"type": "String"
},
"sink": {
"name": "Type",
"type": "String"
}
},
{
"source": {
"name": "TypeDescription",
"type": "String"
},
"sink": {
"name": "TypeDescription",
"type": "String"
}
},
{
"source": {
"name": "PatientId",
"type": "String"
},
"sink": {
"name": "PatientId",
"type": "String"
}
},
{
"source": {
"name": "PatientVisitId",
"type": "String"
},
"sink": {
"name": "PatientVisitId",
"type": "String"
}
},
{
"source": {
"name": "VisitDateOfService",
"type": "String"
},
"sink": {
"name": "VisitDateOfService",
"type": "String"
}
},
{
"source": {
"name": "VisitDateOfEntry",
"type": "String"
},
"sink": {
"name": "VisitDateOfEntry",
"type": "String"
}
},
{
"source": {
"name": "DoctorId",
"type": "String"
},
"sink": {
"name": "DoctorId",
"type": "String"
}
},
{
"source": {
"name": "DoctorName",
"type": "String"
},
"sink": {
"name": "DoctorName",
"type": "String"
}
},
{
"source": {
"name": "FacilityId",
"type": "String"
},
"sink": {
"name": "FacilityId",
"type": "String"
}
},
{
"source": {
"name": "FacilityName",
"type": "String"
},
"sink": {
"name": "FacilityName",
"type": "String"
}
},
{
"source": {
"name": "CompanyName",
"type": "String"
},
"sink": {
"name": "CompanyName",
"type": "String"
}
},
{
"source": {
"name": "TicketNumber",
"type": "String"
},
"sink": {
"name": "TicketNumber",
"type": "String"
}
},
{
"source": {
"name": "TransactionDateOfEntry",
"type": "String"
},
"sink": {
"name": "TransactionDateOfEntry",
"type": "String"
}
},
{
"source": {
"name": "InternalCode",
"type": "String"
},
"sink": {
"name": "InternalCode",
"type": "String"
}
},
{
"source": {
"name": "ExternalCode",
"type": "String"
},
"sink": {
"name": "ExternalCode",
"type": "String"
}
},
{
"source": {
"name": "Description",
"type": "String"
},
"sink": {
"name": "Description",
"type": "String"
}
},
{
"source": {
"name": "Fee",
"type": "String"
},
"sink": {
"name": "Fee",
"type": "String"
}
},
{
"source": {
"name": "Units",
"type": "String"
},
"sink": {
"name": "Units",
"type": "String"
}
},
{
"source": {
"name": "AREffect",
"type": "String"
},
"sink": {
"name": "AREffect",
"type": "String"
}
},
{
"source": {
"name": "Action",
"type": "String"
},
"sink": {
"name": "Action",
"type": "String"
}
},
{
"source": {
"name": "InsuranceGroup",
"type": "String"
},
"sink": {
"name": "InsuranceGroup",
"type": "String"
}
},
{
"source": {
"name": "Payer",
"type": "String"
},
"sink": {
"name": "Payer",
"type": "String"
}
},
{
"source": {
"name": "PayerType",
"type": "String"
},
"sink": {
"name": "PayerType",
"type": "String"
}
},
{
"source": {
"name": "PatBalance",
"type": "String"
},
"sink": {
"name": "PatBalance",
"type": "String"
}
},
{
"source": {
"name": "InsBalance",
"type": "String"
},
"sink": {
"name": "InsBalance",
"type": "String"
}
},
{
"source": {
"name": "Charges",
"type": "String"
},
"sink": {
"name": "Charges",
"type": "String"
}
},
{
"source": {
"name": "Payments",
"type": "String"
},
"sink": {
"name": "Payments",
"type": "String"
}
},
{
"source": {
"name": "Adjustments",
"type": "String"
},
"sink": {
"name": "Adjustments",
"type": "String"
}
},
{
"source": {
"name": "TransferAmount",
"type": "String"
},
"sink": {
"name": "TransferAmount",
"type": "String"
}
},
{
"source": {
"name": "FiledAmount",
"type": "String"
},
"sink": {
"name": "FiledAmount",
"type": "String"
}
},
{
"source": {
"name": "CheckNumber",
"type": "String"
},
"sink": {
"name": "CheckNumber",
"type": "String"
}
},
{
"source": {
"name": "CheckDate",
"type": "String"
},
"sink": {
"name": "CheckDate",
"type": "String"
}
},
{
"source": {
"name": "Created",
"type": "String"
},
"sink": {
"name": "Created",
"type": "String"
}
},
{
"source": {
"name": "ClientTag",
"type": "String"
},
"sink": {
"name": "ClientTag",
"type": "String"
}
}
]
}
},
"inputs": [
{
"referenceName": "PAR_Source_DS",
"type": "DatasetReference"
}
],
"outputs": [
{
"referenceName": "PAR_Sink_DS",
"type": "DatasetReference"
}
]
}
],
"annotations": []
}
}
Pipeline Data Flow (LandWithFlow)
{
  "name": "WriteData",
  "properties": {
    "type": "MappingDataFlow",
    "typeProperties": {
      "sources": [
        {
          "dataset": {
            "referenceName": "PAR_Source_DS",
            "type": "DatasetReference"
          },
          "name": "GetData"
        }
      ],
      "sinks": [
        {
          "dataset": {
            "referenceName": "PAR_Sink_DS",
            "type": "DatasetReference"
          },
          "name": "WriteData"
        }
      ],
      "transformations": [],
      "script": "source(output(\n\t\tRecordAction as string,\n\t\tUniqueId as string,\n\t\tType as string,\n\t\tTypeDescription as string,\n\t\tPatientId as string,\n\t\tPatientVisitId as string,\n\t\tVisitDateOfService as string,\n\t\tVisitDateOfEntry as string,\n\t\tDoctorId as string,\n\t\tDoctorName as string,\n\t\tFacilityId as string,\n\t\tFacilityName as string,\n\t\tCompanyName as string,\n\t\tTicketNumber as string,\n\t\tTransactionDateOfEntry as string,\n\t\tInternalCode as string,\n\t\tExternalCode as string,\n\t\tDescription as string,\n\t\tFee as string,\n\t\tUnits as string,\n\t\tAREffect as string,\n\t\tAction as string,\n\t\tInsuranceGroup as string,\n\t\tPayer as string,\n\t\tPayerType as string,\n\t\tPatBalance as string,\n\t\tInsBalance as string,\n\t\tCharges as string,\n\t\tPayments as string,\n\t\tAdjustments as string,\n\t\tTransferAmount as string,\n\t\tFiledAmount as string,\n\t\tCheckNumber as string,\n\t\tCheckDate as string,\n\t\tCreated as string,\n\t\tClientTag as string\n\t),\n\tallowSchemaDrift: true,\n\tvalidateSchema: false,\n\twildcardPaths:['data.csv']) ~> GetData\nGetData sink(input(\n\t\tRecordAction as string,\n\t\tUniqueId as string,\n\t\tType as string,\n\t\tTypeDescription as string,\n\t\tPatientId as string,\n\t\tPatientVisitId as string,\n\t\tVisitDateOfService as string,\n\t\tVisitDateOfEntry as string,\n\t\tDoctorId as string,\n\t\tDoctorName as string,\n\t\tFacilityId as string,\n\t\tFacilityName as string,\n\t\tCompanyName as string,\n\t\tTicketNumber as string,\n\t\tTransactionDateOfEntry as string,\n\t\tInternalCode as string,\n\t\tExternalCode as string,\n\t\tDescription as string,\n\t\tFee as string,\n\t\tUnits as string,\n\t\tAREffect as string,\n\t\tAction as string,\n\t\tInsuranceGroup as string,\n\t\tPayer as string,\n\t\tPayerType as string,\n\t\tPatBalance as string,\n\t\tInsBalance as string,\n\t\tCharges as string,\n\t\tPayments as string,\n\t\tAdjustments as string,\n\t\tTransferAmount as string,\n\t\tFiledAmount as string,\n\t\tCheckNumber as string,\n\t\tCheckDate as string,\n\t\tCreated as string,\n\t\tClientTag as string,\n\t\tFileName as string,\n\t\tPractice as string\n\t),\n\tallowSchemaDrift: true,\n\tvalidateSchema: false,\n\tdeletable:false,\n\tinsertable:true,\n\tupdateable:false,\n\tupsertable:false,\n\tformat: 'table',\n\tpreSQLs:['TRUNCATE TABLE PatientAR'],\n\tmapColumn(\n\t\tRecordAction,\n\t\tUniqueId,\n\t\tType,\n\t\tTypeDescription,\n\t\tPatientId,\n\t\tPatientVisitId,\n\t\tVisitDateOfService,\n\t\tVisitDateOfEntry,\n\t\tDoctorId,\n\t\tDoctorName,\n\t\tFacilityId,\n\t\tFacilityName,\n\t\tCompanyName,\n\t\tTicketNumber,\n\t\tTransactionDateOfEntry,\n\t\tInternalCode,\n\t\tExternalCode,\n\t\tDescription,\n\t\tFee,\n\t\tUnits,\n\t\tAREffect,\n\t\tAction,\n\t\tInsuranceGroup,\n\t\tPayer,\n\t\tPayerType,\n\t\tPatBalance,\n\t\tInsBalance,\n\t\tCharges,\n\t\tPayments,\n\t\tAdjustments,\n\t\tTransferAmount,\n\t\tFiledAmount,\n\t\tCheckNumber,\n\t\tCheckDate,\n\t\tCreated,\n\t\tClientTag\n\t),\n\tskipDuplicateMapInputs: true,\n\tskipDuplicateMapOutputs: true) ~> WriteData"
    }
  }
}
We are having the same issue. A Copy activity without Data Flow is much faster than a Data Flow. Our case is Copy activity vs. Data Flow. Not sure if I'm doing anything wrong.
Our scenario is just copying 13 tables from source to destination based on a WHERE clause. We currently have two Copy activities, which take 1.5 minutes. So I was thinking of creating a Data Flow with one source and two sinks. But it runs for 5 to 8 minutes, depending on cluster startup time. Hope we get an answer.
Depending on the salaryRange the user selects, I need to validate differently by requiring some fields and rejecting others. I feel like it's a combination of allOf and not, but I can't quite seem to get it.
Scenario #1
User selects salaryRange(Hourly)
Require hourlyRate
Prevent the submission of fields feeOne and feeTwo
Scenario #2
User selects salaryRange(0-50k OR 50-100k)
Require feeOne and feeTwo
Prevent the submission of field hourlyRate
Here is my schema
{
"schema": "http://json-schema.org/draft-04/schema#",
"$id": "http://mysite/schemas/job.json#",
"title": "Job",
"description": "Create job",
"type": "object",
"properties": {
"title": { "type": "string" },
"description": { "type": "string" },
"salaryRange": { "enum": ["0-50k", "50-100k", "100-150k", "150-200k", "200-300k", "300k+", "nonExempt", "Hourly"] },
"hourlyRate": {
"type": "number",
"minimum": 0,
"maximum": 300
},
"feeOne": {
"type": "number",
"minimum": 0
},
"feeTwo": {
"type": "number",
"minimum": 0
}
} ,
"additionalProperties": false,
"required": [
"title",
"description",
"salaryRange"
]
}
You can use `oneOf` together with `not` / `required` to model all possible combinations.
Here is an example in js:
https://runkit.com/embed/cf8cra1mwvx3/
{
  "$schema": "http://json-schema.org/draft-04/schema#",
  "id": "http://mysite/schemas/job.json#",
  "title": "Job",
  "description": "Create job",
  "type": "object",
  "properties": {
    "title": { "type": "string" },
    "description": { "type": "string" },
    "salaryRange": { "enum": ["0-50k", "50-100k", "100-150k", "150-200k", "200-300k", "300k+", "nonExempt", "Hourly"] },
    "hourlyRate": {
      "type": "number",
      "minimum": 0,
      "maximum": 300
    },
    "feeOne": {
      "type": "number",
      "minimum": 0
    },
    "feeTwo": {
      "type": "number",
      "minimum": 0
    }
  },
  "oneOf": [
    {
      "description": "Disallow fees for hourly salary",
      "properties": {
        "salaryRange": { "enum": ["Hourly"] }
      },
      "required": ["hourlyRate"],
      "allOf": [
        { "not": { "required": ["feeOne"] } },
        { "not": { "required": ["feeTwo"] } }
      ]
    },
    {
      "description": "Disallow hourly rate for 0-50k, 50-100k salaries",
      "properties": {
        "salaryRange": { "enum": ["0-50k", "50-100k"] }
      },
      "required": ["feeOne", "feeTwo"],
      "not": { "required": ["hourlyRate"] }
    },
    {
      "description": "Allow other cases",
      "properties": {
        "salaryRange": { "not": { "enum": ["Hourly", "0-50k", "50-100k"] } }
      }
    }
  ],
  "additionalProperties": false,
  "required": [
    "title",
    "description",
    "salaryRange"
  ]
}
I'm helping to build an interface that works with JSON Schema, and I have a question about interface generation based on that schema. There are two display types - one for internal users and one for external users. Both are dealing with the same data, but the external users should see a smaller subset of fields than the internal users.
For example, here is one schema, it defines an obituary:
{
"$schema": "http://json-schema.org/draft-04/schema#",
"description": "",
"type": "object",
"required": [
"id",
"deceased"
],
"properties": {
"id": { "type": "string" },
"account": {
"type": "object",
"required": [
"name"
],
"properties": {
"id": { "type": "number" },
"name": { "type": "string" },
"website": {
"anyOf": [
{
"type": "string",
"format": "uri"
},
{
"type": "string",
"maxLength": 0
}
]
},
"email": {
"anyOf": [
{
"type": "string",
"format": "email"
},
{
"type": "string",
"maxLength": 0
}
]
},
"address": {
"type": "object",
"properties": {
"address1": { "type": "string" },
"address2": { "type": "string" },
"city": { "type": "string" },
"state": { "type": "string" },
"postalCode": { "type": "string" },
"country": { "type": "string" }
}
},
"phoneNumber": {
"anyOf": [
{
"type": "string",
"format": "phone"
},
{
"type": "string",
"maxLength": 0
}
]
},
"faxNumber": {
"anyOf": [
{
"type": "string",
"format": "phone"
},
{
"type": "string",
"maxLength": 0
}
]
},
"type": { "type": "string" }
}
},
"deceased": {
"type": "object",
"required": [
"fullName"
],
"properties": {
"fullName": { "type": "string" },
"prefix": { "type": "string" },
"firstName": { "type": "string" },
"middleName": { "type": "string" },
"nickName": { "type": "string" },
"lastName1": { "type": "string" },
"lastName2": { "type": "string" },
"maidenName": { "type": "string" },
"suffix": { "type": "string" }
}
},
"description": { "type": "string" },
"photos": {
"type": "array",
"items": { "type": "string" }
}
}
}
Internal users would be able to access all the fields, but external users shouldn't be able to read/write the account fields.
Should I make a second schema for the external users, or is there a way to indicate different display levels or public/private on each field?
You cannot restrict access to the fields defined in a schema, but you can have 2 schema files: one defining the "public" fields, and the other one defining the restricted fields plus including the public fields (by referencing the public schema).
So
public-schema.json:
{
"properties" : {
"id" : ...
}
}
restricted-schema.json:
{
"allOf" : [
{
"$ref" : "./public-schema.json"
},
{
"properties" : {
"account": ...
}
}
]
}
I'm trying to create a BigQuery table using Python. Other operations (queries, retrieving table bodies etc.) are working fine, but when trying to create a table I'm stuck with an error:
apiclient.errors.HttpError: https://www.googleapis.com/bigquery/v2/projects/marechal-consolidation/datasets/marechal_results/tables?alt=json
returned "Output field used as input">
Here's the command I'm executing:
projectId = 'xxxx'
dataSet = 'marechal_results'
# Load the table definition (the "JSON table" shown below) from disk.
with open(filePath+'tableStructure.json') as data_file:
    structure = json.load(data_file)
# Submit the loaded definition as the request body of tables.insert.
table_result = tables.insert(projectId=projectId, datasetId=dataSet, body=structure).execute()
JSON table:
{
"kind": "bigquery#table",
"tableReference": {
"projectId": "xxxx",
"tableId": "xxxx",
"datasetId": "xxxx"
},
"type": "table",
"schema": {
"fields": [
{
"mode": "REQUIRED",
"type": "STRING",
"description": "Company",
"name": "COMPANY"
},
{
"mode": "REQUIRED",
"type": "STRING",
"description": "Currency",
"name": "CURRENCY"
}
// bunch of other fields follow...
]
}
}
Why am I receiving this error?
EDIT: Here's the JSON object I'm passing as parameter:
{
"kind": "bigquery#table",
"type": "TABLE",
"tableReference": {
"projectId": "xxxx",
"tableId": "xxxx",
"datasetId": "xxxx"
},
"schema": {
"fields": [
{
"type": "STRING",
"name": "COMPANY"
},
{
"type": "STRING",
"name": "YEAR"
},
{
"type": "STRING",
"name": "COUNTRY_ISO"
},
{
"type": "STRING",
"name": "COUNTRY"
},
{
"type": "STRING",
"name": "COUNTRY_GROUP"
},
{
"type": "STRING",
"name": "REGION"
},
{
"type": "STRING",
"name": "AREA"
},
{
"type": "STRING",
"name": "BU"
},
{
"type": "STRING",
"name": "REFERENCE"
},
{
"type": "FLOAT",
"name": "QUANTITY"
},
{
"type": "FLOAT",
"name": "NET_SALES"
},
{
"type": "FLOAT",
"name": "GROSS_SALES"
},
{
"type": "STRING",
"name": "FAM_GRP"
},
{
"type": "STRING",
"name": "FAMILY"
},
{
"type": "STRING",
"name": "PRESENTATION"
},
{
"type": "STRING",
"name": "ORIG_FAMILY"
},
{
"type": "FLOAT",
"name": "REF_PRICE"
},
{
"type": "STRING",
"name": "CODE1"
},
{
"type": "STRING",
"name": "CODE4"
}
]
}
}
This is probably too late to help you, but hopefully it helps the next poor soul like me. It took me a while to figure out what "Output field used as input" meant.
Though the API specifies the same object for the request (input) and response (output), some fields are only allowed in the response. In the docs you will see their descriptions prefixed with "Output only". From looking at your table definition I see that you have "type": "TABLE", and "type" is listed as an "Output only" property. So I would guess that if you remove it, the error will go away. Here is the link to the docs: https://cloud.google.com/bigquery/docs/reference/rest/v2/tables
It would help if they told you what field the violation was on.