How to write an automated test when the robot state doesn't change after a start signal (ROS published topic)? - testing

I have a ROS2 package that simulates a robot's (machine's) movements (actions) with Gazebo. I am able to launch Gazebo and RViz and the robot is displayed correctly. But when I try to simulate some robot (machine) movements that are initialized with start and digging (machine movement) signals (which are actually ROS2 topics), the robot (machine) does not perform them (it does not dig).
So first, this is the launch file:
#!/usr/bin/env python3
import os
from os import path as osp
import shlex
import time
from backhoe_launch.launch_utils import include_launch
from launch import LaunchDescription
from launch.actions import DeclareLaunchArgument
from launch.actions.execute_process import ExecuteProcess
from launch.substitutions import LaunchConfiguration
from ament_index_python import get_package_prefix
from launch import logging
def generate_launch_description():
    logging.reset()
    ros_pkg_path = get_package_prefix('backhoe_ros_pkg').replace(
        "/install/", "/src/"
    )
    default_exp_dir = osp.join(ros_pkg_path, "experiments")
    log_manage_txt = 'log_dirs.txt'
    log_manage_path = osp.join(default_exp_dir, log_manage_txt)
    start_time = time.strftime('%Y%m%d%H%M%S')
    exp_result_dir = osp.join(default_exp_dir, start_time)
    rosbag_dir = osp.join(exp_result_dir, 'rosbag')
    rosbag_dir_separate = osp.join(exp_result_dir, 'rosbag_separate')
    if not os.path.isdir(exp_result_dir):
        os.makedirs(exp_result_dir)
    logging.launch_config.log_dir = exp_result_dir
    arguments = [
        DeclareLaunchArgument(
            'namespace',
            default_value=['backhoe'],
        ),
        DeclareLaunchArgument(
            'points',
            default_value=['/backhoe/livox/lidar'],
        ),
        DeclareLaunchArgument(
            'inverted_imu',
            default_value=['True'],
        ),
        DeclareLaunchArgument(
            'experiment_name',
            default_value=[''],
        ),
        DeclareLaunchArgument(
            'speed_type',
            default_value=['low'],
        ),
        DeclareLaunchArgument(
            'record_topic',
            default_value=['some'],
        ),
        DeclareLaunchArgument(
            'rosbag_dir',
            default_value=[rosbag_dir],
        ),
        DeclareLaunchArgument(
            'rosbag_dir_separate',
            default_value=[rosbag_dir_separate],
        ),
        DeclareLaunchArgument(
            'is_system_test',
            default_value=['False'],
        ),
    ]
    state_publisher_node = include_launch(
        'backhoe_launch',
        'backhoe_state_publisher_launch.py',
        launch_arguments=[
            ('namespace', LaunchConfiguration('namespace')),
            ('inverted_imu', LaunchConfiguration('inverted_imu')),
        ],
    )
    perception_bring_up_node = include_launch(
        'backhoe_launch',
        'perception_bringup_launch.py',
        launch_arguments=[
            ('namespace', LaunchConfiguration('namespace')),
            ('points', LaunchConfiguration('points')),
        ],
    )
    digging_perception_node = include_launch(
        'backhoe_launch',
        'digging_perception_launch.py',
        launch_arguments=[
            ('namespace', LaunchConfiguration('namespace')),
        ],
    )
    dumper_perception_service_node = include_launch(
        'backhoe_launch',
        'dumper_perception_service_launch.py',
        launch_arguments=[
            ('namespace', LaunchConfiguration('namespace')),
            ('input_points1', '/filtered/points'),
            ('input_points2', '/dup/dumper_perception/input/points'),
        ],
    )
    dumper_termination_service_node = include_launch(
        'backhoe_launch',
        'dumper_termination_service_launch.py',
        launch_arguments=[
            ('namespace', LaunchConfiguration('namespace')),
            ('input_points1', '/filtered/points'),
            ('input_points2', '/dup/dumper_termination/input/points'),
        ],
    )
    state_machine_node = include_launch(
        'backhoe_state_machine',
        'backhoe_state_machine_launch.py',
        launch_arguments=[
            ('namespace', LaunchConfiguration('namespace')),
            ('experiment_name', LaunchConfiguration('experiment_name')),
            ('speed_type', LaunchConfiguration('speed_type')),
        ],
    )
    assert state_machine_node
    rosbag_record = include_launch(
        'backhoe_launch',
        'rosbag_record_launch.py',
        launch_arguments=[
            ('record_topic', LaunchConfiguration('record_topic')),
            ('directory', LaunchConfiguration('rosbag_dir')),
            ('directory_separate', LaunchConfiguration('rosbag_dir_separate')),
        ],
    )
    system_test_node = include_launch(
        'backhoe_system_test',
        'backhoe_system_test_launch.py',
        launch_arguments=[
            ('namespace', LaunchConfiguration('namespace')),
            ('is_system_test', LaunchConfiguration('is_system_test')),
        ],
    )
    userinput_node = include_launch(
        'backhoe_interface',
        'userinput_launch.py',
        launch_arguments=[
            ('namespace', LaunchConfiguration('namespace')),
        ],
    )
    nodes = [
        system_test_node,
        state_publisher_node,
        perception_bring_up_node,
        digging_perception_node,
        dumper_perception_service_node,
        dumper_termination_service_node,
        state_machine_node,
        rosbag_record,
        userinput_node,
    ]
    log_name = [
        ExecuteProcess(
            cmd=shlex.split(f'echo {start_time} >> {log_manage_path}'),
            shell=True,
            output='screen',
            on_exit=nodes,
        )
    ]
    return LaunchDescription([*arguments, *log_name])
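(For context, I start everything with ros2 launch; launch arguments such as is_system_test can be overridden on the command line, with the actual package and file names in place of the placeholders:)
ros2 launch <package_name> <this_launch_file>.py is_system_test:=True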
The first start signal works and the robot (machine) moves by 20 degrees. This is the first start signal:
ros2 topic pub /robot/start_signal std_msgs/msg/Bool '{data: True}' -1
and I can see the state change in the terminal:
[logger]: Entering DumperDetection
[rexas_dashboard_node.py-31] [INFO] [1667976821.134913838] [backhoe.dashboard]: Looking up diagnostics_toplevel_state from dashboard
Gazebo is listening on /robot/action for Action-type messages and executes them. We can confirm that with:
ros2 topic echo /robot/action
boom_volt:
  data: 1.4500000476837158
arm_volt:
  data: 1.4500000476837158
bucket_volt:
  data: 1.4500000476837158
revolve_volt:
  data: 1.4500000476837158
caterpillar_left_volt:
  data: 1.4500000476837158
caterpillar_right_volt:
  data: 1.4500000476837158
So far so good. But when I execute the second command (which should start the robot/machine digging), the robot (machine) does not do anything and the state does not change. This is the command:
ros2 topic pub /robot/start_digging_signal std_msgs/msg/Bool '{data: True}' -1
and I can see that
ros2 topic echo /robot/action
is empty (nothing is published), and the state does not change either. My question is: how can I find or develop an automated test, and do some debugging, to locate the error? What kind of test can I use or develop for cases like this, where the state machine is not progressing and some ROS topics (signals) have no effect?
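For reference, this is roughly the kind of test I have in mind: a plain rclpy/pytest function that publishes the digging signal and asserts that something shows up on /robot/action within a timeout. The launch file above is assumed to already be running (started separately or via launch_testing), and the Action message import is only a guess, since I am not sure of the exact message package:
import time

import rclpy
from std_msgs.msg import Bool

# Assumption: I do not know the real package/name of the message on /robot/action.
from backhoe_msgs.msg import Action


def test_digging_signal_produces_action():
    rclpy.init()
    node = rclpy.create_node('digging_signal_test')
    received = []

    # Collect anything published on /robot/action.
    node.create_subscription(Action, '/robot/action', received.append, 10)
    pub = node.create_publisher(Bool, '/robot/start_digging_signal', 10)

    # Let discovery settle so publisher and subscriptions are matched.
    end = time.time() + 2.0
    while time.time() < end:
        rclpy.spin_once(node, timeout_sec=0.1)

    pub.publish(Bool(data=True))

    # Wait up to 10 s for at least one message on /robot/action.
    deadline = time.time() + 10.0
    while time.time() < deadline and not received:
        rclpy.spin_once(node, timeout_sec=0.1)

    node.destroy_node()
    rclpy.shutdown()
    assert received, 'no message on /robot/action after start_digging_signal'
Would something like this be a reasonable starting point? For plain debugging, I assume ros2 topic info /robot/start_digging_signal would at least tell me whether the state machine node actually subscribes to that topic.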
Thanks

Related

Passing subdomains to dash_leaflet.TileLayer

I have followed and adapted the LayersControl example from https://dash-leaflet.herokuapp.com/. I am trying to include a basemap from this (https://basemap.at/wmts/1.0.0/WMTSCapabilities.xml) source.
Upon running the code I get the error
Invalid argument subdomains passed into TileLayer.
Expected string.
Was supplied type array.
Value provided:
[
"map",
"map1",
"map2",
"map3",
"map4"
]
Looking into the documentation for dash_leaflet.TileLayer it says
- subdomains (string; optional):
Subdomains of the tile service. Can be passed in the form of one
string (where each letter is a subdomain name) or an array of
strings.
I think I understand the error message, but the error seems to disagree with the docstring of TileLayer. I am not sure if I have missed a detail here.
MWE:
import dash
from dash import html
import dash_leaflet as dl
url = "https://{s}.wien.gv.at/basemap/geolandbasemap/normal/google3857/{z}/{y}/{x}.png"
subdomains = ["map", "map1", "map2", "map3", "map4"]
name = "Geoland Basemap"
attribution = "basemap.at"
app = dash.Dash(__name__)
app.layout = html.Div(
    dl.Map(
        [
            dl.LayersControl(
                [
                    dl.BaseLayer(dl.TileLayer(), name="default map", checked=True),
                    dl.BaseLayer(
                        dl.TileLayer(
                            url=url, attribution=attribution, subdomains=subdomains
                        ),
                        name=name,
                        checked=False,
                    ),
                ]
            )
        ],
        zoom=7,
        center=(47.3, 15.0),
    ),
    style={"width": "100%", "height": "50vh", "margin": "auto", "display": "block"},
)
if __name__ == "__main__":
    app.run_server(debug=True)
I am running
dash==2.6.1
dash_leaflet==0.1.23
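As a temporary workaround I can avoid passing subdomains at all by pinning one subdomain directly in the URL (a sketch, assuming any single subdomain serves all tiles), but I would still like to know how the list is supposed to be passed:
import dash_leaflet as dl

url = "https://{s}.wien.gv.at/basemap/geolandbasemap/normal/google3857/{z}/{y}/{x}.png"
# Pin one subdomain so the subdomains prop is not needed at all.
url_single = url.replace("{s}", "map1")
layer = dl.BaseLayer(
    dl.TileLayer(url=url_single, attribution="basemap.at"),
    name="Geoland Basemap",
    checked=False,
)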

Import only csv files from GCS to Dataflow and BigQuery using Cloud Composer - Apache Airflow

I have a use case: there are several file types in GCS (json, csv, txt, ...), but I only want to pick up the csv files, transform them with Dataflow in Python (such as renaming fields), and then write them to BigQuery. The main requirement is to use Airflow sensors, without Cloud Functions, to trigger the pipeline whenever a new csv file lands in GCS.
Here is my code:
from datetime import timedelta, datetime
from airflow.models import DAG
from airflow.contrib.operators.bigquery_operator import BigQueryOperator
from airflow.providers.google.cloud.sensors.gcs import GCSObjectExistenceSensor
from airflow.contrib.operators.dataflow_operator import DataflowTemplateOperator
PROJECT = 'abc'
ZONE = 'us-central1-c'
BUCKET_NAME = 'bucket_testing'
BQ_DATASET = "abc.dataset_name"
LOCATION = "US"
DEFAULT_DAG_ARGS = {
    'owner': 'gcs to bigquery using dataflow',
    'depends_on_past': False,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    'schedule_interval': '@daily',
    'dataflow_default_options': {
        'project': PROJECT,
        'zone': ZONE,
        'stagingLocation': BUCKET_NAME
    }
}
ENVIRONMENT = {
    "bypassTempDirValidation": "false",
    "maxWorkers": "20",
    "numWorkers": "1",
    "serviceAccountEmail": "abc8932097-compute@developer.gserviceaccount.com",
    "tempLocation": "gs://composer_bucket",
    "ipConfiguration": "WORKER_IP_UNSPECIFIED",
    "additionalExperiments": [
        "sideinput_io_metrics"
    ]
}
PARAMETERS = {
    "outputTable": "abc:dataset_name.how_to_define_here",  # how to get multiple tables from multiple csv files?
    "bigQueryLoadingTemporaryDirectory": "gs://composer_bucket",
}
with DAG('dag_sensor', default_args=DEFAULT_DAG_ARGS, dagrun_timeout=timedelta(hours=3), schedule_interval='00 * * * *') as dag:
    gcs_file_exists = GCSObjectExistenceSensor(
        task_id="gcs_object_sensor",
        bucket=BUCKET_NAME,
        object='*.csv',
        mode='poke',
    )
    my_dataflow_job = DataflowTemplateOperator(
        task_id='transfer_from_gcs_to_bigquery',
        template='???',  # what do I need to write here?
        parameters=PARAMETERS,
        environment=ENVIRONMENT,
        dag=dag
    )
    my_bq_result = BigQueryOperator(
        task_id='write_to_bq',
        use_legacy_sql=False,
        write_disposition='WRITE_TRUNCATE',
        create_disposition='CREATE_IF_NEEDED',
        dag=dag
    )
    gcs_file_exists >> my_dataflow_job >> my_bq_result
I am a newbie here, so please point me to a detailed example.
Many thanks!
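One idea I am considering is a small custom sensor that lists the bucket and only succeeds once at least one .csv object is present, since (as far as I can tell) GCSObjectExistenceSensor checks for an exact object name rather than a wildcard. A rough sketch (the class name is made up, and the import paths assume the Airflow 2 Google provider):
from airflow.sensors.base import BaseSensorOperator
from airflow.providers.google.cloud.hooks.gcs import GCSHook


class GCSCsvObjectSensor(BaseSensorOperator):
    # Pokes until the bucket contains at least one .csv object.
    def __init__(self, bucket, gcp_conn_id='google_cloud_default', **kwargs):
        super().__init__(**kwargs)
        self.bucket = bucket
        self.gcp_conn_id = gcp_conn_id

    def poke(self, context):
        hook = GCSHook(gcp_conn_id=self.gcp_conn_id)
        objects = hook.list(self.bucket) or []
        csv_files = [name for name in objects if name.endswith('.csv')]
        self.log.info('Found %d csv file(s) in %s', len(csv_files), self.bucket)
        return bool(csv_files)
Is something like this the right direction, or is there a built-in sensor that already does this?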

AWS Glue - Create date partition from timestamp field

Having a data frame with a timestamp field, like so:
timestamp               | id | version
------------------------|----|--------
2022-01-01 01:02:00.000 | 1  | 2
2022-01-01 05:12:00.000 | 1  | 2
I've created a Glue job that uses ApplyMapping to save the data to a new S3 location. Currently I've added id and version partitions by selecting those fields in the visual editor, and my data is saved with the following structure: id=1/version=2/. I would like to parse the timestamp and extract the date value so the filesystem structure becomes id=1/version=2/dt=2022-01-01/. However, in the visual editor I can only select the timestamp field and can't perform any manipulation on it. I'm guessing I need to change the code, but I'm not sure how.
Code:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
args = getResolvedOptions(sys.argv, ["JOB_NAME"])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args["JOB_NAME"], args)
# Script generated for node S3 bucket
S3bucket_node1 = glueContext.create_dynamic_frame.from_options(
    format_options={},
    connection_type="s3",
    format="parquet",
    connection_options={"paths": ["s3://my-data"], "recurse": True},
    transformation_ctx="S3bucket_node1",
)
# Script generated for node ApplyMapping
ApplyMapping_node2 = ApplyMapping.apply(
    frame=S3bucket_node1,
    mappings=[
        ("timestamp", "timestamp", "timestamp", "timestamp"),
        ("id", "string", "id", "string"),
        ("version", "string", "version", "string"),
    ],
    transformation_ctx="ApplyMapping_node2",
)
# Script generated for node S3 bucket
S3bucket_node3 = glueContext.write_dynamic_frame.from_options(
    frame=ApplyMapping_node2,
    connection_type="s3",
    format="glueparquet",
    connection_options={
        "path": "s3://target-data",
        "partitionKeys": ["id", "version"],
    },
    format_options={"compression": "gzip"},
    transformation_ctx="S3bucket_node3",
)
job.commit()
Use the Map Class.
Add this method to your script:
def AddDate(rec):
    ts = str(rec["timestamp"])
    rec["dt"] = ts[:10]
    return rec
Insert the Map Transform after the ApplyMapping step.
Mapped_dyF = Map.apply(frame = ApplyMapping_node2, f = AddDate)
Update the write-to-S3 step; notice the changes to frame and partitionKeys.
S3bucket_node3 = glueContext.write_dynamic_frame.from_options(
    frame=Mapped_dyF,
    connection_type="s3",
    format="glueparquet",
    connection_options={
        "path": "s3://target-data",
        "partitionKeys": ["id", "version", "dt"],
    },
    format_options={"compression": "gzip"},
    transformation_ctx="S3bucket_node3",
)
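Alternatively, if you would rather derive the partition column with Spark functions instead of a Python Map, a roughly equivalent sketch (assuming the timestamp column is a real timestamp or ISO-formatted string) is to round-trip through a DataFrame:
from awsglue.dynamicframe import DynamicFrame
from pyspark.sql.functions import col, date_format

# Derive dt as yyyy-MM-dd from the timestamp column, then convert back.
df = ApplyMapping_node2.toDF().withColumn("dt", date_format(col("timestamp"), "yyyy-MM-dd"))
Mapped_dyF = DynamicFrame.fromDF(df, glueContext, "Mapped_dyF")
Either way, Mapped_dyF is what gets written out with partitionKeys ["id", "version", "dt"].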

Dataflow / BigQuery FILE_LOADS: Job did not reach to a terminal state after waiting indefinitely

I have the following pipeline:
with beam.Pipeline(options=pipeline_options) as p:
    (
        p
        | "Read Pub/Sub Messages" >> beam.io.ReadFromPubSub(subscription=pubsub_subscription).with_output_types(bytes)
        | 'Fetch from API 1' >> beam.Map(fetch_1)
        | 'Filter out invalid data' >> beam.Filter(lambda item: item is not None)
        | 'Fetch from API 2' >> beam.Map(fetch_1)
        | 'Filter out invalid data' >> beam.Filter(lambda item: item is not None)
        | 'Parse Article to BQ json' >> beam.Map(parse_to_bq_json)
        | 'WriteToBigQuery' >> beam.io.WriteToBigQuery(table='BQ_TABLE_NAME',
                                                       write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                                                       method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
                                                       triggering_frequency=5
                                                       )
    )
This runs as expected with DirectRunner, but with DataflowRunner it ends with:
Job did not reach to a terminal state after waiting indefinitely.
Nothing more, nothing less. Docs or other mentions of a similar case are very limited, so any feedback is more than welcome.
Sample from last lines:
INFO:apache_beam.runners.dataflow.dataflow_runner:2021-01-22T17:27:50.574Z: JOB_MESSAGE_DETAILED: Fusing consumer WriteToBigQuery/BigQueryBatchFileLoads/WaitForDestinationLoadJobs/_UnpickledSideInput(MapToVoidKey0.out.0)/GroupByKey/WriteStream into WriteToBigQuery/BigQueryBatchFileLoads/WaitForDestinationLoadJobs/_UnpickledSideInput(MapToVoidKey0.out.0)/PairWithVoidKey
INFO:apache_beam.runners.dataflow.dataflow_runner:2021-01-22T17:27:50.603Z: JOB_MESSAGE_DETAILED: Fusing consumer WriteToBigQuery/BigQueryBatchFileLoads/WaitForDestinationLoadJobs/_UnpickledSideInput(MapToVoidKey0.out.0)/GroupByKey/MergeBuckets into WriteToBigQuery/BigQueryBatchFileLoads/WaitForDestinationLoadJobs/_UnpickledSideInput(MapToVoidKey0.out.0)/GroupByKey/ReadStream
INFO:apache_beam.runners.dataflow.dataflow_runner:2021-01-22T17:27:50.637Z: JOB_MESSAGE_DETAILED: Fusing consumer WriteToBigQuery/BigQueryBatchFileLoads/WaitForDestinationLoadJobs/_UnpickledSideInput(MapToVoidKey0.out.0)/Values into WriteToBigQuery/BigQueryBatchFileLoads/WaitForDestinationLoadJobs/_UnpickledSideInput(MapToVoidKey0.out.0)/GroupByKey/MergeBuckets
INFO:apache_beam.runners.dataflow.dataflow_runner:2021-01-22T17:27:50.672Z: JOB_MESSAGE_DETAILED: Fusing consumer WriteToBigQuery/BigQueryBatchFileLoads/WaitForDestinationLoadJobs/_UnpickledSideInput(MapToVoidKey0.out.0)/StreamingPCollectionViewWriter into WriteToBigQuery/BigQueryBatchFileLoads/WaitForDestinationLoadJobs/_UnpickledSideInput(MapToVoidKey0.out.0)/Values
INFO:apache_beam.runners.dataflow.dataflow_runner:2021-01-22T17:27:50.705Z: JOB_MESSAGE_DETAILED: Fusing consumer WriteToBigQuery/BigQueryBatchFileLoads/RemoveTempTables/PassTables/PassTables into WriteToBigQuery/BigQueryBatchFileLoads/WaitForCopyJobs/WaitForCopyJobs
INFO:apache_beam.runners.dataflow.dataflow_runner:2021-01-22T17:27:50.739Z: JOB_MESSAGE_DETAILED: Fusing consumer WriteToBigQuery/BigQueryBatchFileLoads/RemoveTempTables/AddUselessValue into WriteToBigQuery/BigQueryBatchFileLoads/RemoveTempTables/PassTables/PassTables
INFO:apache_beam.runners.dataflow.dataflow_runner:2021-01-22T17:27:50.772Z: JOB_MESSAGE_DETAILED: Fusing consumer WriteToBigQuery/BigQueryBatchFileLoads/RemoveTempTables/DeduplicateTables/WriteStream into WriteToBigQuery/BigQueryBatchFileLoads/RemoveTempTables/AddUselessValue
INFO:apache_beam.runners.dataflow.dataflow_runner:2021-01-22T17:27:50.822Z: JOB_MESSAGE_DETAILED: Fusing consumer WriteToBigQuery/BigQueryBatchFileLoads/RemoveTempTables/DeduplicateTables/MergeBuckets into WriteToBigQuery/BigQueryBatchFileLoads/RemoveTempTables/DeduplicateTables/ReadStream
INFO:apache_beam.runners.dataflow.dataflow_runner:2021-01-22T17:27:50.848Z: JOB_MESSAGE_DETAILED: Fusing consumer WriteToBigQuery/BigQueryBatchFileLoads/RemoveTempTables/GetTableNames into WriteToBigQuery/BigQueryBatchFileLoads/RemoveTempTables/DeduplicateTables/MergeBuckets
INFO:apache_beam.runners.dataflow.dataflow_runner:2021-01-22T17:27:50.880Z: JOB_MESSAGE_DETAILED: Fusing consumer WriteToBigQuery/BigQueryBatchFileLoads/RemoveTempTables/Delete into WriteToBigQuery/BigQueryBatchFileLoads/RemoveTempTables/GetTableNames
INFO:apache_beam.runners.dataflow.dataflow_runner:2021-01-22T17:27:50.915Z: JOB_MESSAGE_DETAILED: Fusing consumer WriteToBigQuery/BigQueryBatchFileLoads/ImpulseEmptyPC/FlatMap(<lambda at core.py:3024>) into WriteToBigQuery/BigQueryBatchFileLoads/ImpulseEmptyPC/Impulse
INFO:apache_beam.runners.dataflow.dataflow_runner:2021-01-22T17:27:50.939Z: JOB_MESSAGE_DETAILED: Fusing consumer WriteToBigQuery/BigQueryBatchFileLoads/ImpulseEmptyPC/Map(decode) into WriteToBigQuery/BigQueryBatchFileLoads/ImpulseEmptyPC/FlatMap(<lambda at core.py:3024>)
INFO:apache_beam.runners.dataflow.dataflow_runner:2021-01-22T17:27:51.008Z: JOB_MESSAGE_DETAILED: Fusing consumer WriteToBigQuery/BigQueryBatchFileLoads/Flatten/FlattenReplace/WriteStream into WriteToBigQuery/BigQueryBatchFileLoads/TriggerLoadJobsWithTempTables/ParDo(TriggerLoadJobs)/ParDo(TriggerLoadJobs)
INFO:apache_beam.runners.dataflow.dataflow_runner:2021-01-22T17:27:51.033Z: JOB_MESSAGE_DETAILED: Fusing consumer WriteToBigQuery/BigQueryBatchFileLoads/Flatten/FlattenReplace/WriteStream into WriteToBigQuery/BigQueryBatchFileLoads/TriggerLoadJobsWithoutTempTables/TriggerLoadJobsWithoutTempTables
INFO:apache_beam.runners.dataflow.dataflow_runner:2021-01-22T17:27:51.165Z: JOB_MESSAGE_ERROR: Workflow failed.
INFO:apache_beam.runners.dataflow.dataflow_runner:2021-01-22T17:27:51.205Z: JOB_MESSAGE_DETAILED: Cleaning up.
INFO:apache_beam.runners.dataflow.dataflow_runner:2021-01-22T17:27:51.252Z: JOB_MESSAGE_BASIC: Worker pool stopped.
Traceback (most recent call last):
  File "/Applications/PyCharm CE.app/Contents/plugins/python-ce/helpers/pydev/pydevd.py", line 1477, in _exec
    pydev_imports.execfile(file, globals, locals)  # execute the script
  File "/Applications/PyCharm CE.app/Contents/plugins/python-ce/helpers/pydev/_pydev_imps/_pydev_execfile.py", line 18, in execfile
    exec(compile(contents+"\n", file, 'exec'), glob, loc)
  File "/Users/XXXX/dev/XXXX/app/app.py", line 151, in <module>
    run(args, pipeline_args)
  File "/Users/XXXX/dev/XXXX/app/app.py", line 108, in run
    p.run().wait_until_finish()
  File "/Users/XXXX/.virtualenvs/app/lib/python3.7/site-packages/apache_beam/runners/dataflow/dataflow_runner.py", line 1675, in wait_until_finish
    'Job did not reach to a terminal state after waiting indefinitely.')
AssertionError: Job did not reach to a terminal state after waiting indefinitely.
Edit 1: Adding output from console log (unfortunately not much info there):
{
  textPayload: "Workflow failed."
  insertId: "1rtvonbcgg5"
  resource: {
    type: "dataflow_step"
    labels: {
      project_id: "437008213460"
      job_name: "app-test"
      step_id: ""
      region: "europe-west1"
      job_id: "2021-01-22_11_22_27-2214838125974198028"
    }
  }
  timestamp: "2021-01-22T19:22:37.425862432Z"
  severity: "ERROR"
  labels: {
    dataflow.googleapis.com/job_id: "2021-01-22_11_22_27-2214838125974198028"
    dataflow.googleapis.com/job_name: "app-test"
    dataflow.googleapis.com/log_type: "system"
    dataflow.googleapis.com/region: "europe-west1"
  }
  logName: "projects/some-project-eu/logs/dataflow.googleapis.com%2Fjob-message"
  receiveTimestamp: "2021-01-22T19:22:39.086520796Z"
}
Edit 2: Adding simplified version:
def foo(stream_data):
    return str(datetime.now())

with beam.Pipeline(options=pipeline_options) as p:
    (
        p
        | "Read Pub/Sub Messages" >> beam.io.ReadFromPubSub(subscription=pubsub_subscription).with_output_types(bytes)
        | 'Do foo' >> beam.Map(foo)
        | 'WriteToBigQuery' >> beam.io.WriteToBigQuery(table=bq_project + ':' + bq_dataset + '.' + TABLE_NAME,
                                                       schema={"fields": [{"name": "foo_ts", "type": "TIMESTAMP"}]},
                                                       create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                                                       write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                                                       method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
                                                       triggering_frequency=5,
                                                       )
    )
and my run commands:
streaming_app.py
--input_subscription projects/awesome_project/subscriptions/sub-test
--runner DataflowRunner
--bq_project awesome_project
--bq_dataset awesome_dataset
--region europe-west1
--temp_location gs://awesome-nlp
--job_name hope-it-works-test
--setup_file ./setup.py
--max_num_workers 10
Edit 3: Adding also job id of one of the failed jobs: 2021-01-24_06_31_49-168256842937211337
Can you try comparing your code with the sample Dataflow runner code given in the example below? I cannot see your complete code, but if you fit your code to the sample given below, it should run on the Dataflow runner.
EDIT 1:
Please find below a working example:-
#------------Import Lib-----------------------#
import apache_beam as beam
from apache_beam import window
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions
import os, sys, time
import argparse
import json
import logging
from apache_beam.options.pipeline_options import SetupOptions
from datetime import datetime

#------------Set up BQ parameters-----------------------#
# Replace with Project Id
project = 'xxxxxxxxxxx'
Pubsub_subscription = 'projects/xxxxxxxxxxx/subscriptions/Pubsubdemo_subscription'

#------------Splitting of Records----------------------#
class Transaction_ECOM(beam.DoFn):
    def process(self, element):
        logging.info(element)
        result = json.loads(element)
        data_bkt = result.get('_bkt', 'null')
        data_cd = result.get('_cd', 'null')
        data_indextime = result.get('_indextime', '0')
        data_kv = result.get('_kv', 'null')
        data_raw = result['_raw']
        data_raw1 = data_raw.replace("\n", "")
        data_serial = result.get('_serial', 'null')
        data_si = str(result.get('_si', 'null'))
        data_sourcetype = result.get('_sourcetype', 'null')
        data_subsecond = result.get('_subsecond', 'null')
        data_time = result.get('_time', 'null')
        data_host = result.get('host', 'null')
        data_index = result.get('index', 'null')
        data_linecount = result.get('linecount', 'null')
        data_source = result.get('source', 'null')
        data_sourcetype1 = result.get('sourcetype', 'null')
        data_splunk_server = result.get('splunk_server', 'null')
        return [{"datetime_indextime": time.strftime('%Y-%m-%dT%H:%M:%S', time.localtime(int(data_indextime))), "_bkt": data_bkt, "_cd": data_cd, "_indextime": data_indextime, "_kv": data_kv, "_raw": data_raw1, "_serial": data_serial, "_si": data_si, "_sourcetype": data_sourcetype, "_subsecond": data_subsecond, "_time": data_time, "host": data_host, "index": data_index, "linecount": data_linecount, "source": data_source, "sourcetype": data_sourcetype1, "splunk_server": data_splunk_server}]

def run(argv=None, save_main_session=True):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args, streaming=True)
    pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
    p1 = beam.Pipeline(options=pipeline_options)

    data_loading = (
        p1
        | "Read Pub/Sub Messages" >> beam.io.ReadFromPubSub(subscription=Pubsub_subscription)
    )

    project_id = "xxxxxxxxxxx"
    dataset_id = 'test123'
    table_schema_ECOM = ('datetime_indextime:DATETIME, _bkt:STRING, _cd:STRING, _indextime:STRING, _kv:STRING, _raw:STRING, _serial:STRING, _si:STRING, _sourcetype:STRING, _subsecond:STRING, _time:STRING, host:STRING, index:STRING, linecount:STRING, source:STRING, sourcetype:STRING, splunk_server:STRING')

    # Persist to BigQuery
    # WriteToBigQuery accepts the data as list of JSON objects
    #---------------------Index = ITF----------------------------------------------------------------------------------------------------------------------
    result = (
        data_loading
        | 'Clean-ITF' >> beam.ParDo(Transaction_ECOM())
        | 'Write-ITF' >> beam.io.WriteToBigQuery(
            table='CFF_ABC',
            dataset=dataset_id,
            project=project_id,
            schema=table_schema_ECOM,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND
        ))

    result = p1.run()
    result.wait_until_finish()

if __name__ == '__main__':
    path_service_account = '/home/vibhg/Splunk/CFF/xxxxxxxxxxx-abcder125.json'
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = path_service_account
    run()
It has a few additional libraries, so just ignore those.
Some key features are:-
Set 'streaming' as 'True'
Subscription name should be of the format 'projects/<xxxxxxxxxxx>/subscriptions/<subscription name>'
Sample data which is published on the topic (and will be captured from the subscription) is given below:-
{"_bkt": "A1E8-A5370FECA146", "_cd": "412:140787687", "_indextime": "1611584940", "_kv": "1", "_raw": "2021-01-25 14:28:59,126 INFO [com.abcd.mfs.builder.builders.BsLogEntryBuilder] [-] LogEntryType=\"BsCall\", fulName=\"EBCMFSSALES02\", BusinessServiceName=\"BsSalesOrderCreated\", Locality=\"NA\", Success=\"True\", BsExecutionTime=\"00:00:00.005\", OrderNo=\"374941817\", Locality=\"NA\" , [fulName=\"EBCMFSSALES02\"], [bsName=\"BsSalesOrderCreated\"], [userId=\"s-oitp-u-global\"], [userIdRegion=\"NA\"], [msgId=\"aaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbcccc\"], [msgIdSeq=\"2\"], [originator=\"ISOM\"] ", "_serial": "0", "_si": ["9ttr-bfc-gcp-europe-besti1", "itf"], "_sourcetype": "BBClog", "_subsecond": ".126", "_time": "2021-01-25 14:28:59.126 UTC", "host": "shampo-lx4821.abcd.com", "index": "itf", "linecount": "1", "source": "/opt/VRE/WebSphere/lickserv/profiles/appsrv01/logs/na-ebtree02_srv/log4j2.log", "sourcetype": "BBClog", "web_server": "9ttr-bfc-gcp-europe-besti1"}
[vibhg@aiclassificationdev8 jobrun]$ head -2 ITF_202101251435
{"_bkt": "itf~412~2EE5428B-7CEA-4C49-A1E8-A5370FECA146", "_cd": "412:140787687", "_indextime": "1611584940", "_kv": "1", "_raw": "2021-01-25 14:28:59,126 INFO [com.abcd.mfs.builder.builders.BsLogEntryBuilder] [-] LogEntryType=\"BsCall\", fulName=\"EBCMFSSALES02\", BusinessServiceName=\"BsSalesOrderCreated\", Locality=\"NA\", Success=\"True\", BsExecutionTime=\"00:00:00.005\", OrderNo=\"374941817\", Locality=\"NA\" , [fulName=\"EBCMFSSALES02\"], [bsName=\"BsSalesOrderCreated\"], [userId=\"s-oitp-u-global\"], [userIdRegion=\"NA\"], [msgId=\"aaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbcccc\"], [msgIdSeq=\"2\"], [originator=\"ISOM\"] ", "_serial": "0", "_si": ["9ttr-bfc-gcp-europe-besti1", "itf"], "_sourcetype": "BBClog", "_subsecond": ".126", "_time": "2021-01-25 14:28:59.126 UTC", "host": "shampo-lx4821.abcd.com", "index": "itf", "linecount": "1", "source": "/opt/VRE/WebSphere/lickserv/profiles/appsrv01/logs/na-ebtree02_srv/log4j2.log", "sourcetype": "BBClog", "web_server": "9ttr-bfc-gcp-europe-besti1"}
{"_bkt": "9-A1E8-A5370FECA146", "_cd": "412:140787671", "_indextime": "1611584940", "_kv": "1", "_raw": "2021-01-25 14:28:58,659 INFO [com.abcd.mfs.builder.builders.BsLogEntryBuilder] [-] LogEntryType=\"BsCall\", fulName=\"EBCMFSSALES02\", BusinessServiceName=\"BsCreateOrderV2\", BsExecutionTime=\"00:00:01.568\", OrderNo=\"374942155\", CountryCode=\"US\", ClientSystem=\"owfe-webapp\" , [fulName=\"EBCMFSSALES02\"], [bsName=\"BsCreateOrderV2\"], [userId=\"s-salja1-u-irssemal\"], [userIdRegion=\"NA\"], [msgId=\"6652311fece28966\"], [msgIdSeq=\"25\"], [originator=\"SellingApi\"] ", "_serial": "1", "_si": ["9ttr-bfc-gcp-europe-besti1", "itf"], "_sourcetype": "BBClog", "_subsecond": ".659", "_time": "2021-01-25 14:28:58.659 UTC", "host": "shampo-lx4821.abcd.com", "index": "itf", "linecount": "1", "source": "/opt/VRE/WebSphere/lickserv/profiles/appsrv01/logs/na-ebtree02_srv/log4j2.log", "sourcetype": "BBClog", "web_server": "9ttr-bfc-gcp-europe-besti1"}
You can execute the script with the following command:-
python script.py --region europe-west1 --project xxxxxxx --temp_location gs://temp/temp --runner DataflowRunner --job_name name
It looks like you have missed setting the streaming parameter in your code.
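For reference, the streaming flag can be set either directly on the PipelineOptions constructor (as in the example above) or via StandardOptions, for example:
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions

pipeline_options = PipelineOptions(pipeline_args)
# Equivalent to PipelineOptions(pipeline_args, streaming=True)
pipeline_options.view_as(StandardOptions).streaming = True
Passing --streaming on the command line should also work if the option is left to be parsed from pipeline_args.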

How to ignore an unknown column when loading to bigQuery using Airflow?

I'm loading data from Google Cloud Storage to BigQuery using GoogleCloudStorageToBigQueryOperator.
It may be that the JSON file has more columns than I defined. In that case I want the load job to continue and simply ignore the unrecognized columns.
I tried to use the ignore_unknown_values argument but it didn't make any difference.
My operator:
def dc():
    return [
        {
            "name": "id",
            "type": "INTEGER",
            "mode": "NULLABLE"
        },
        {
            "name": "storeId",
            "type": "INTEGER",
            "mode": "NULLABLE"
        },
        ...
    ]
gcs_to_bigquery_st = GoogleCloudStorageToBigQueryOperator(
    dag=dag,
    task_id='load_to_BigQuery_stage',
    bucket=GCS_BUCKET_ID,
    destination_project_dataset_table=table_name_template_st,
    source_format='NEWLINE_DELIMITED_JSON',
    source_objects=[gcs_export_uri_template],
    ignore_unknown_values=True,
    schema_fields=dc(),
    create_disposition='CREATE_IF_NEEDED',
    write_disposition='WRITE_APPEND',
    skip_leading_rows=1,
    google_cloud_storage_conn_id=CONNECTION_ID,
    bigquery_conn_id=CONNECTION_ID
)
The error:
u'Error while reading data, error message: JSON parsing error in row
starting at position 0: No such field: shippingService.',
which is true. shippingService doesn't exist and it won't be added to the table.
How can I fix this?
Edit:
Removed the schema_fields=dc() from the operator:
gcs_to_bigquery_st = GoogleCloudStorageToBigQueryOperator(
    dag=dag,
    task_id='load_to_BigQuery_stage',
    bucket=GCS_BUCKET_ID,
    destination_project_dataset_table=table_name_template_st,
    source_format='NEWLINE_DELIMITED_JSON',
    source_objects=[gcs_export_uri_template],
    ignore_unknown_values=True,
    create_disposition='CREATE_IF_NEEDED',
    write_disposition='WRITE_APPEND',
    skip_leading_rows=1,
    google_cloud_storage_conn_id=CONNECTION_ID,
    bigquery_conn_id=CONNECTION_ID
)
Still gives the same error. This doesn't make sense... it has an argument to ignore unknown values :(
The only reason I can think of is you are probably using Airflow 1.9. This feature was added in Airflow 1.10.
However, you can use it as follows in Airflow 1.9 by adding src_fmt_configs={'ignoreUnknownValues': True}:
gcs_to_bigquery_st = GoogleCloudStorageToBigQueryOperator(
    dag=dag,
    task_id='load_to_BigQuery_stage',
    bucket=GCS_BUCKET_ID,
    destination_project_dataset_table=table_name_template_st,
    source_format='NEWLINE_DELIMITED_JSON',
    source_objects=[gcs_export_uri_template],
    src_fmt_configs={'ignoreUnknownValues': True},
    create_disposition='CREATE_IF_NEEDED',
    write_disposition='WRITE_APPEND',
    skip_leading_rows=1,
    google_cloud_storage_conn_id=CONNECTION_ID,
    bigquery_conn_id=CONNECTION_ID
)
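If you are unsure which Airflow version your environment is running (and therefore which of the two forms applies), checking it first is the quickest sanity check:
airflow version
On 1.10+ the ignore_unknown_values argument is accepted directly; on 1.9 the src_fmt_configs workaround above is needed.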