Upload bulk CSV data into an existing DynamoDB table - amazon-s3

I'm trying to migrate data from a CSV file into an existing AWS DynamoDB table, as part of an AWS Amplify web app.
I followed this CloudFormation tutorial, using the template below.
I was only able to create a new DynamoDB table; I could not point the stack at an existing table and add data to it.
QUESTION:
Is there a way to modify the template so that I can provide an existing table name at the "Specify stack details" step of the wizard, under "DynamoDBTableName", so that the CSV data is added to that table? If not, is there an alternative process?
{
"AWSTemplateFormatVersion": "2010-09-09",
"Metadata": {
},
"Parameters" : {
"BucketName": {
"Description": "Name of the S3 bucket you will deploy the CSV file to",
"Type": "String",
"ConstraintDescription": "must be a valid bucket name."
},
"FileName": {
"Description": "Name of the S3 file (including suffix)",
"Type": "String",
"ConstraintDescription": "Valid S3 file name."
},
"DynamoDBTableName": {
"Description": "Name of the dynamoDB table you will use",
"Type": "String",
"ConstraintDescription": "must be a valid dynamoDB name."
}
},
"Resources": {
"DynamoDBTable":{
"Type": "AWS::DynamoDB::Table",
"Properties":{
"TableName": {"Ref" : "DynamoDBTableName"},
"BillingMode": "PAY_PER_REQUEST",
"AttributeDefinitions":[
{
"AttributeName": "id",
"AttributeType": "S"
}
],
"KeySchema":[
{
"AttributeName": "id",
"KeyType": "HASH"
}
],
"Tags":[
{
"Key": "Name",
"Value": {"Ref" : "DynamoDBTableName"}
}
]
}
},
"LambdaRole" : {
"Type" : "AWS::IAM::Role",
"Properties" : {
"AssumeRolePolicyDocument": {
"Version" : "2012-10-17",
"Statement" : [
{
"Effect" : "Allow",
"Principal" : {
"Service" : ["lambda.amazonaws.com","s3.amazonaws.com"]
},
"Action" : [
"sts:AssumeRole"
]
}
]
},
"Path" : "/",
"ManagedPolicyArns":["arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole","arn:aws:iam::aws:policy/AWSLambdaInvocation-DynamoDB","arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"],
"Policies": [{
"PolicyName": "policyname",
"PolicyDocument": {
"Version": "2012-10-17",
"Statement": [{
"Effect": "Allow",
"Resource": "*",
"Action": [
"dynamodb:PutItem",
"dynamodb:BatchWriteItem"
]
}]
}
}]
}
},
"CsvToDDBLambdaFunction": {
"Type": "AWS::Lambda::Function",
"Properties": {
"Handler": "index.lambda_handler",
"Role": {
"Fn::GetAtt": [
"LambdaRole",
"Arn"
]
},
"Code": {
"ZipFile": {
"Fn::Join": [
"\n",
[
"import json",
"import boto3",
"import os",
"import csv",
"import codecs",
"import sys",
"",
"s3 = boto3.resource('s3')",
"dynamodb = boto3.resource('dynamodb')",
"",
"bucket = os.environ['bucket']",
"key = os.environ['key']",
"tableName = os.environ['table']",
"",
"def lambda_handler(event, context):",
"",
"",
" #get() does not store in memory",
" try:",
" obj = s3.Object(bucket, key).get()['Body']",
" except:",
" print(\"S3 Object could not be opened. Check environment variable. \")",
" try:",
" table = dynamodb.Table(tableName)",
" except:",
" print(\"Error loading DynamoDB table. Check if table was created correctly and environment variable.\")",
"",
" batch_size = 100",
" batch = []",
"",
" #DictReader is a generator; not stored in memory",
" for row in csv.DictReader(codecs.getreader('utf-8-sig')(obj)):",
" if len(batch) >= batch_size:",
" write_to_dynamo(batch)",
" batch.clear()",
"",
" batch.append(row)",
"",
" if batch:",
" write_to_dynamo(batch)",
"",
" return {",
" 'statusCode': 200,",
" 'body': json.dumps('Uploaded to DynamoDB Table')",
" }",
"",
"",
"def write_to_dynamo(rows):",
" try:",
" table = dynamodb.Table(tableName)",
" except:",
" print(\"Error loading DynamoDB table. Check if table was created correctly and environment variable.\")",
"",
" try:",
" with table.batch_writer() as batch:",
" for i in range(len(rows)):",
" batch.put_item(",
" Item=rows[i]",
" )",
" except:",
" print(\"Error executing batch_writer\")"
]
]
}
},
"Runtime": "python3.7",
"Timeout": 900,
"MemorySize": 3008,
"Environment" : {
"Variables" : {"bucket" : { "Ref" : "BucketName" }, "key" : { "Ref" : "FileName" },"table" : { "Ref" : "DynamoDBTableName" }}
}
}
},
"S3Bucket": {
"DependsOn" : ["CsvToDDBLambdaFunction","BucketPermission"],
"Type": "AWS::S3::Bucket",
"Properties": {
"BucketName": {"Ref" : "BucketName"},
"AccessControl": "BucketOwnerFullControl",
"NotificationConfiguration":{
"LambdaConfigurations":[
{
"Event":"s3:ObjectCreated:*",
"Function":{
"Fn::GetAtt": [
"CsvToDDBLambdaFunction",
"Arn"
]
}
}
]
}
}
},
"BucketPermission":{
"Type": "AWS::Lambda::Permission",
"Properties":{
"Action": "lambda:InvokeFunction",
"FunctionName":{"Ref" : "CsvToDDBLambdaFunction"},
"Principal": "s3.amazonaws.com",
"SourceAccount": {"Ref":"AWS::AccountId"}
}
}
},
"Outputs" : {
}
}
Another answer
Dennis' answer is one solution, but you can also comment out the "DynamoDBTable" resource under "Resources" in the template. The Lambda function only reads the table name from the "DynamoDBTableName" parameter (via its environment variables), so with that resource removed the stack loads the CSV into your existing table instead of creating a new one.

You can migrate CSV files from Amazon S3 to Amazon DynamoDB using the AWS Database Migration Service (DMS). Have a look at this step-by-step walkthrough.
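Another alternative, if you only need a one-off load and would rather not run a stack at all, is a small boto3 script. This is a minimal sketch (not from the original post) that assumes the table already exists, the CSV header row contains the table's key attribute(s), and the bucket, key, and table names shown are placeholders:

import csv
import codecs

import boto3  # assumes AWS credentials are configured in the environment

BUCKET = "my-bucket"              # placeholder bucket name
KEY = "data.csv"                  # placeholder object key
TABLE_NAME = "my-existing-table"  # placeholder: name of the existing table

s3 = boto3.resource("s3")
dynamodb = boto3.resource("dynamodb")

def load_csv_to_dynamodb():
    # Stream the CSV object from S3 and batch-write each row to DynamoDB.
    body = s3.Object(BUCKET, KEY).get()["Body"]
    table = dynamodb.Table(TABLE_NAME)
    # batch_writer() buffers rows and handles BatchWriteItem retries for us.
    with table.batch_writer() as batch:
        for row in csv.DictReader(codecs.getreader("utf-8-sig")(body)):
            batch.put_item(Item=row)  # every CSV value is written as a string

if __name__ == "__main__":
    load_csv_to_dynamodb()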

Related

How is S3 bucket name being derived in CloudFormation?

I have this CloudFormation script, template.js, that creates a bucket, and I'm a bit unsure how the bucket name is being assembled.
Assuming my stack name is my-service, the bucket gets created with the name my-service-s3bucket-1p3s4szy5bomf.
I want to know how this name was derived.
I also want to get rid of that suffix at the end, -1p3s4szy5bomf.
Can I skip Outputs at the end? Not sure what they do.
Code in template.js:
var stackTemplate = {
"AWSTemplateFormatVersion": "2010-09-09",
"Description": "with S3",
"Resources": {
"S3Bucket": {
"Type": "AWS::S3::Bucket",
"DeletionPolicy": "Retain",
"Properties": {},
"Metadata": {
"AWS::CloudFormation::Designer": {
"id": "bba483af-4ae6-4d3d-b37d-435f66c42e44"
}
}
},
"S3BucketAccessPolicy": {
"Type": "AWS::IAM::Policy",
"Properties": {
"PolicyName": "S3BucketAccessPolicy",
"Roles": [
{
"Ref": "IAMServiceRole"
}
],
"PolicyDocument": {
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": [
"s3:DeleteObject",
"s3:GetObject",
"s3:PutObject",
"s3:PutObjectAcl",
"s3:List*"
],
"Resource": [
{
"Fn::Sub": [
"${S3BucketArn}",
{
"S3BucketArn": {
"Fn::GetAtt": ["S3Bucket", "Arn"]
}
}
]
},
{
"Fn::Sub": [
"${S3BucketArn}/*",
{
"S3BucketArn": {
"Fn::GetAtt": ["S3Bucket", "Arn"]
}
}
]
}
]
}
]
}
}
}
},
"Outputs": {
"s3Bucket": {
"Description": "The created S3 bucket.",
"Value": {
"Ref": "S3Bucket"
},
"Export": {
"Name": {
"Fn::Sub": "${AWS::StackName}-S3Bucket"
}
}
},
"s3BucketArn": {
"Description": "The ARN of the created S3 bucket.",
"Value": {
"Fn::GetAtt": ["S3Bucket", "Arn"]
},
"Export": {
"Name": {
"Fn::Sub": "${AWS::StackName}-S3BucketArn"
}
}
}
}
};
stackUtils.assembleStackTemplate(stackTemplate, module);
I want to know how this name was derived
If you don't specify a name for your bucket, CloudFormation generates one based on the pattern $name-of-stack-s3bucket-$generatedId.
From the documentation: https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-s3-bucket.html
BucketName
A name for the bucket. If you don't specify a name, AWS CloudFormation generates a unique ID and uses that ID for the bucket name.
I also want to get rid of that suffix at the end, -1p3s4szy5bomf.
You can assign a name to your bucket, but AWS recommends leaving it empty so that CloudFormation generates one for you, to avoid collisions when the template is reused (in a StackSet, for example). Example:
"Resources": {
"S3Bucket": {
"Type": "AWS::S3::Bucket",
"DeletionPolicy": "Retain",
"Properties": {
"BucketName": "DesiredNameOfBucket" <==
},
"Metadata": {
"AWS::CloudFormation::Designer": {
"id": "bba483af-4ae6-4d3d-b37d-435f66c42e44"
}
}
},
Can I skip Outputs at the end? Not sure what they do.
Outputs expose information about the created resources (here the bucket's name and ARN) and export it so other stacks can reference it. If you don't need that, you can delete the Outputs section from your template.
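If you keep the generated name, you can also look it up from the stack programmatically. A minimal boto3 sketch, assuming a stack named my-service and the logical resource ID S3Bucket from the template above:

import boto3

cfn = boto3.client("cloudformation")

# Resolve the physical (generated) bucket name from the logical resource ID.
detail = cfn.describe_stack_resource(
    StackName="my-service",        # assumed stack name from the question
    LogicalResourceId="S3Bucket",  # logical ID used in the template above
)["StackResourceDetail"]

print(detail["PhysicalResourceId"])  # e.g. my-service-s3bucket-1p3s4szy5bomf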

Set Subnet ID and EC2 Key Name in EMR Cluster Config via Step Functions

As of November 2019, AWS Step Functions has native support for orchestrating EMR clusters, so we are trying to configure a cluster and run some jobs on it.
We could not find any documentation on how to set the SubnetId or the EC2 key name used for the instances in the cluster. Is there any such possibility?
As of now our create-cluster step looks like the following:
"States": {
"Create an EMR cluster": {
"Type": "Task",
"Resource": "arn:aws:states:::elasticmapreduce:createCluster.sync",
"Parameters": {
"Name": "TestCluster",
"VisibleToAllUsers": true,
"ReleaseLabel": "emr-5.26.0",
"Applications": [
{ "Name": "spark" }
],
"ServiceRole": "SomeRole",
"JobFlowRole": "SomeInstanceProfile",
"LogUri": "s3://some-logs-bucket/logs",
"Instances": {
"KeepJobFlowAliveWhenNoSteps": true,
"InstanceFleets": [
{
"Name": "MasterFleet",
"InstanceFleetType": "MASTER",
"TargetOnDemandCapacity": 1,
"InstanceTypeConfigs": [
{
"InstanceType": "m3.2xlarge"
}
]
},
{
"Name": "CoreFleet",
"InstanceFleetType": "CORE",
"TargetSpotCapacity": 2,
"InstanceTypeConfigs": [
{
"InstanceType": "m3.2xlarge",
"BidPriceAsPercentageOfOnDemandPrice": 100 }
]
}
]
}
},
"ResultPath": "$.cluster",
"End": "true"
}
}
As soon as we try to add a "SubnetId" key to any of the sub-objects in Parameters, or to Parameters itself, we get the error:
Invalid State Machine Definition: 'SCHEMA_VALIDATION_FAILED: The field "SubnetId" is not supported by Step Functions at /States/Create an EMR cluster/Parameters' (Service: AWSStepFunctions; Status Code: 400; Error Code: InvalidDefinition;
Referring to the Step Functions docs on the EMR integration, we can see that createCluster.sync uses the EMR API RunJobFlow. In RunJobFlow we can specify the Ec2KeyName and Ec2SubnetId located at the paths $.Instances.Ec2KeyName and $.Instances.Ec2SubnetId.
With that said, I managed to create a state machine with the following definition (on a side note, your definition had a syntax error with "End": "true", which should be "End": true):
{
"Comment": "A Hello World example of the Amazon States Language using Pass states",
"StartAt": "Create an EMR cluster",
"States": {
"Create an EMR cluster": {
"Type": "Task",
"Resource": "arn:aws:states:::elasticmapreduce:createCluster.sync",
"Parameters": {
"Name": "TestCluster",
"VisibleToAllUsers": true,
"ReleaseLabel": "emr-5.26.0",
"Applications": [
{
"Name": "spark"
}
],
"ServiceRole": "SomeRole",
"JobFlowRole": "SomeInstanceProfile",
"LogUri": "s3://some-logs-bucket/logs",
"Instances": {
"Ec2KeyName": "ENTER_EC2KEYNAME_HERE",
"Ec2SubnetId": "ENTER_EC2SUBNETID_HERE",
"KeepJobFlowAliveWhenNoSteps": true,
"InstanceFleets": [
{
"Name": "MasterFleet",
"InstanceFleetType": "MASTER",
"TargetOnDemandCapacity": 1,
"InstanceTypeConfigs": [
{
"InstanceType": "m3.2xlarge"
}
]
},
{
"Name": "CoreFleet",
"InstanceFleetType": "CORE",
"TargetSpotCapacity": 2,
"InstanceTypeConfigs": [
{
"InstanceType": "m3.2xlarge",
"BidPriceAsPercentageOfOnDemandPrice": 100
}
]
}
]
}
},
"ResultPath": "$.cluster",
"End": true
}
}
}
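For reference, these two fields sit at the Instances level of the underlying RunJobFlow API itself. Below is a minimal boto3 sketch (not part of the original answer) that mirrors the state machine above, keeping the same placeholder role, key, and subnet values:

import boto3

emr = boto3.client("emr")  # assumes AWS credentials and region are configured

# Same shape as the Step Functions Parameters block: Ec2KeyName and
# Ec2SubnetId live inside the Instances structure of RunJobFlow.
response = emr.run_job_flow(
    Name="TestCluster",
    ReleaseLabel="emr-5.26.0",
    Applications=[{"Name": "Spark"}],
    ServiceRole="SomeRole",             # placeholder
    JobFlowRole="SomeInstanceProfile",  # placeholder
    LogUri="s3://some-logs-bucket/logs",
    VisibleToAllUsers=True,
    Instances={
        "Ec2KeyName": "ENTER_EC2KEYNAME_HERE",    # placeholder key pair name
        "Ec2SubnetId": "ENTER_EC2SUBNETID_HERE",  # placeholder subnet ID
        "KeepJobFlowAliveWhenNoSteps": True,
        "InstanceFleets": [
            {
                "Name": "MasterFleet",
                "InstanceFleetType": "MASTER",
                "TargetOnDemandCapacity": 1,
                "InstanceTypeConfigs": [{"InstanceType": "m3.2xlarge"}],
            },
            {
                "Name": "CoreFleet",
                "InstanceFleetType": "CORE",
                "TargetSpotCapacity": 2,
                "InstanceTypeConfigs": [
                    {
                        "InstanceType": "m3.2xlarge",
                        "BidPriceAsPercentageOfOnDemandPrice": 100,
                    }
                ],
            },
        ],
    },
)
print(response["JobFlowId"])  # ID of the newly created cluster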

SSMS Tabular Model: create multiple partition via one single xmla script

I need to create a considerable number of partitions using XMLA scripts run through SSMS.
Is there a way to combine the creation of multiple partitions into one single script?
In the example below, I have to execute the first script, wait for it to finish, and then open and execute the second script. Very time-consuming.
How can I restructure the code so it runs in a single execution?
Script 1:
"createOrReplace": {
"object": {
"database": "MYDB",
"table": "MYTABLE1", "partition": "Partition"
},
"partition": {
"name": "Process_OLD", "dataView": "full",
"source": {
"type": "m",
"expression": [
"let",
" Source = #\"mySQL/MY_SCHEMA\",", " MY_SCHEMA= Source{[Schema=\"MY_SCHEMA\"]}[Data],",
" AllData = MY_SCHEMA{[Name=\"MYTABLE1\"]}[Data],", "\t#\"Filtered Rows\" = Table.SelectRows(AllData, each [DATE] < 20170101)",
"in",
" #\"Filtered Rows\""
]
}
}
}
}
Script 2:
"createOrReplace": {
"object": {
"database": "MYDB",
"table": "MYTABLE2", "partition": "Partition"
},
"partition": {
"name": "Process_NEW", "dataView": "full",
"source": {
"type": "m",
"expression": [
"let",
" Source = #\"mySQL/MY_SCHEMA\",", " MY_SCHEMA= Source{[Schema=\"MY_SCHEMA\"]}[Data],",
" AllData = MY_SCHEMA{[Name=\"MYTABLE1\"]}[Data],", "\t#\"Filtered Rows\" = Table.SelectRows(AllData, each [DATE] >= 20170101)",
"in",
" #\"Filtered Rows\""
]
}
}
}
}
You can put a sequence command around it:
{
"sequence": {
"operations": [
{
"createOrReplace": {
"object": {
"database": "MYDB",
"table": "MYTABLE1",
"partition": "Partition"
},
"partition": {
"name": "Process_OLD",
"dataView": "full",
"source": {
"type": "m",
"expression": [
"let",
" Source = #\"mySQL/MY_SCHEMA\",",
" MY_SCHEMA= Source{[Schema=\"MY_SCHEMA\"]}[Data],",
" AllData = MY_SCHEMA{[Name=\"MYTABLE1\"]}[Data],",
"\t#\"Filtered Rows\" = Table.SelectRows(AllData, each [DATE] < 20170101)",
"in",
" #\"Filtered Rows\""
]
}
}
}
},
{
"createOrReplace": {
"object": {
"database": "MYDB",
"table": "MYTABLE2",
"partition": "Partition"
},
"partition": {
"name": "Process_NEW",
"dataView": "full",
"source": {
"type": "m",
"expression": [
"let",
" Source = #\"mySQL/MY_SCHEMA\",",
" MY_SCHEMA= Source{[Schema=\"MY_SCHEMA\"]}[Data],",
" AllData = MY_SCHEMA{[Name=\"MYTABLE1\"]}[Data],",
"\t#\"Filtered Rows\" = Table.SelectRows(AllData, each [DATE] >= 20170101)",
"in",
" #\"Filtered Rows\""
]
}
}
}
}
]
}
}

CloudFormation S3 bucket creation

Here's the CloudFormation template I wrote to create a simple S3 bucket. How do I specify the name of the bucket? Is this the right way?
{
"AWSTemplateFormatVersion": "2010-09-09",
"Description": "Simple S3 Bucket",
"Parameters": {
"OwnerService": {
"Type": "String",
"Default": "CloudOps",
"Description": "Owner or service name. Used to identify the owner of the vpc stack"
},
"ProductCode": {
"Type": "String",
"Default": "cloudops",
"Description": "Lowercase version of the product code (i.e. jem). Used for tagging"
},
"StackEnvironment": {
"Type": "String",
"Default": "stage",
"Description": "Lowercase version of the environment name (i.e. stage). Used for tagging"
}
},
"Mappings": {
"RegionMap": {
"us-east-1": {
"ShortRegion": "ue1"
},
"us-west-1": {
"ShortRegion": "uw1"
},
"us-west-2": {
"ShortRegion": "uw2"
},
"eu-west-1": {
"ShortRegion": "ew1"
},
"ap-southeast-1": {
"ShortRegion": "as1"
},
"ap-northeast-1": {
"ShortRegion": "an1"
},
"ap-northeast-2": {
"ShortRegion": "an2"
}
}
},
"Resources": {
"JenkinsBuildBucket": {
"Type": "AWS::S3::Bucket",
"Properties": {
"BucketName": {
"Fn::Join": [
"-",
[
{
"Ref": "ProductCode"
},
{
"Ref": "StackEnvironment"
},
"deployment",
{
"Fn::FindInMap": [
"RegionMap",
{
"Ref": "AWS::Region"
},
"ShortRegion"
]
}
]
]
},
"AccessControl": "Private"
},
"DeletionPolicy": "Delete"
}
},
"Outputs": {
"DeploymentBucket": {
"Description": "Bucket Containing Chef files",
"Value": {
"Ref": "DeploymentBucket"
}
}
}
}
Here's a really simple CloudFormation template that creates an S3 bucket, including defining the bucket name.
AWSTemplateFormatVersion: '2010-09-09'
Description: create a single S3 bucket
Resources:
SampleBucket:
Type: AWS::S3::Bucket
Properties:
BucketName: sample-bucket-0827-cc
You can also leave the "Properties: BucketName" lines off if you want AWS to name the bucket for you. Then it will look like $StackName-SampleBucket-$uniqueIdentifier.
Hope this helps.
Your code has the BucketName already specified:
"BucketName": {
"Fn::Join": [
"-",
[
{
"Ref": "ProductCode"
},
{
"Ref": "StackEnvironment"
},
"deployment",
{
"Fn::FindInMap": [
"RegionMap",
{
"Ref": "AWS::Region"
},
"ShortRegion"
]
}
]
]
},
The BucketName is a string, and since you are using Fn::Join, it is assembled from the values you are joining, separated by the delimiter.
"The intrinsic function Fn::Join appends a set of values into a single value, separated by the specified delimiter. If a delimiter is the empty string, the set of values are concatenated with no delimiter."
Your bucket name, if you don't change the defaults, is:
cloudops-stage-deployment-<short-region> (for example, cloudops-stage-deployment-ue1 in us-east-1)
If you change the default parameters, both cloudops and stage can be changed; "deployment" is hard-coded, and the region is taken from where the stack is running and converted to its short form via the RegionMap mapping.
To extend cloudquiz's answer, this is what it would look like in YAML format:
Resources:
SomeS3Bucket:
Type: AWS::S3::Bucket
Properties:
BucketName:
Fn::Join: ["-", ["yourbucketname", {'Fn::Sub': '${AWS::Region}'}, {'Fn::Sub': '${Stage}'}]]

Make a new array from a nested object using Lodash

Here is my data
[
{
"properties": {
"key": {
"data": "companya data",
"company": "Company A"
}
},
"uniqueId" : 1
},
{
"properties": {
"key": {
"data": "companyb data",
"company": "Company B"
}
},
"uniqueId" : 2
},
{
"properties": {
"key": {
"data": "companyc data",
"company": "Company C"
}
},
"uniqueId" : 3
}
]
The format I need for my typeahead directive is below. I was trying to work from the other post I made, but still couldn't make it work. The simplest approach seems to be to flatten the nested collection into a plain collection of objects.
[
{
"uniqueId" : 1,
"data": "companya data"
},
{
"uniqueId" : 2,
"data": "companyb data"
},
{
"uniqueId" : 3,
"data": "companyc data"
}
]
I got it!
console.log(
_(jsonData).map(function(obj) {
return {
d : obj.properties.key.data,
id : obj.uniqueId
}
})
.value()
);
You do not have to use the chaining feature of lodash as long as you are only performing one operation. You can simply use:
_.map(jsonData, function(obj) {
return {
d : obj.properties.key.data,
id : obj.uniqueId
}
});