I have 2 CSV files, expeditions- 2010s.csv and peaks.csv, with the join key 'peakid'. I'm using a notebook with Apache Beam on Dataflow to join them. Here is my code:
def read_csv_file(readable_file):
    import apache_beam as beam
    import csv
    import io
    import datetime

    # Open a channel to read the file from GCS
    gcs_file = beam.io.filesystems.FileSystems.open(readable_file)

    # Read it as CSV; you can also use csv.reader
    csv_dict = csv.DictReader(io.TextIOWrapper(gcs_file))
    for row in csv_dict:
        yield row
def run(argv=None):
    import argparse
    import apache_beam as beam
    import io
    from apache_beam.options.pipeline_options import PipelineOptions

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        required=False,
        help='Input file to read. This can be a local file or '
             'a file in a Google Storage Bucket.',
        # This example file contains a total of only 10 lines.
        # Useful for developing on a small set of data.
        default='gs://bucket/folder/peaks.csv')
    parser.add_argument(
        '--input1',
        dest='input1',
        required=False,
        help='Input file to read. This can be a local file or '
             'a file in a Google Storage Bucket.',
        # This example file contains a total of only 10 lines.
        # Useful for developing on a small set of data.
        default='gs://bucket/folder/expeditions- 2010s.csv')
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    p = beam.Pipeline(options=pipeline_options)
    input_p1 = (
        p
        | 'Read From GCS input1' >> beam.Create([known_args.input1])
        | 'Parse csv file p1' >> beam.FlatMap(read_csv_file)
        | 'Tuple p1' >> beam.Map(lambda e: (e["peakid"], {'peakid': e["peakid"], 'bcdate': e["bcdate"], 'smtdate': e["smtdate"]}))
    )

    input_p2 = (
        p
        | 'Read From GCS input2' >> beam.Create([known_args.input])
        | 'Parse csv file p2' >> beam.FlatMap(read_csv_file)
        | 'Tuple p2' >> beam.Map(lambda e: (e["peakid"], {'peakid': e["peakid"], 'pkname': e["pkname"], 'heightm': e["heightm"]}))
    )
    # CoGroupByKey: relational join of 2 or more keyed PCollections. It also accepts a dict of keyed PCollections.
    output = (
        (input_p1, input_p2)
        | 'Join' >> beam.CoGroupByKey()
        | 'Final Dict' >> beam.Map(lambda el: to_final_dict(el[1]))
        # | beam.Map(print)
        | 'Write To BigQuery' >> beam.io.gcp.bigquery.WriteToBigQuery(
            table='project:dataset.expeditions',
            method='FILE_LOADS',
            custom_gcs_temp_location='gs://bucket/folder/temp',
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
    )

    p.run().wait_until_finish()
def to_final_dict(list_tuple_of_tuple):
    result = {}
    for list_tuple in list_tuple_of_tuple:
        for el in list_tuple:
            result.update(el)
    return result
# runner = DataflowRunner()
# runner.run_pipeline(p, options=options)
if __name__ == '__main__':
    import logging
    logging.getLogger().setLevel(logging.INFO)
    run()
I got this result before writing to BigQuery:
{'peakid': 'TKRG', 'bcdate': '4/24/10', 'smtdate': '5/5/10', 'pkname': 'Takargo', 'heightm': '6771'}
{'peakid': 'AMPG', 'bcdate': '4/5/10', 'smtdate': '', 'pkname': 'Amphu Gyabjen', 'heightm': '5630'}
{'peakid': 'AMAD', 'bcdate': '1/27/20', 'smtdate': '2/2/20', 'pkname': 'Ama Dablam', 'heightm': '6814'}
{'peakid': 'ANN1', 'bcdate': '3/27/19', 'smtdate': '4/23/19', 'pkname': 'Annapurna I', 'heightm': '8091'}
...
But it fails to write to BigQuery with this error:
RuntimeError: BigQuery job beam_bq_job_LOAD_AUTOMATIC_JOB_NAME_LOAD_STEP_602_215864ba592a2e01f0c4e2157cc60c47_51de5de53b58409da70f699c833c4db5 failed. Error Result: <ErrorProto
location: 'gs://bucket/folder/temp/bq_load/4bbfc44d750c4af5ab376b2e3c3dedbd/project.dataset.expeditions/25905e46-db76-49f0-9b98-7d77131e3e0d'
message: 'Error while reading data, error message: JSON table encountered too many errors, giving up. Rows: 3; errors: 1. Please look into the errors[] collection for more details. File: gs://bucket/folder/temp/bq_load/4bbfc44d750c4af5ab376b2e3c3dedbd/project.dataset.expeditions/25905e46-db76-49f0-9b98-7d77131e3e0d'
reason: 'invalid'> [while running 'Write To BigQuery/BigQueryBatchFileLoads/WaitForDestinationLoadJobs']
I think the date format is not correct. Use the following format for your date fields: YYYY-MM-DD (e.g. 2013-12-25), and normally this will solve your issue.
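For example, a minimal sketch of a helper (my own addition, not part of the original pipeline) that normalizes the M/D/YY strings coming out of the CSV before they reach WriteToBigQuery, assuming empty strings should become NULL:

from datetime import datetime

def normalize_date(value):
    # '4/24/10' -> '2010-04-24'; empty strings become None so BigQuery stores NULL
    if not value:
        return None
    return datetime.strptime(value, '%m/%d/%y').strftime('%Y-%m-%d')

# e.g. inside the 'Tuple p1' lambda, wrap the date fields:
# {'peakid': e["peakid"], 'bcdate': normalize_date(e["bcdate"]), 'smtdate': normalize_date(e["smtdate"])}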
It seems like the error occurs in bcdate. I transformed it to the correct format, but the error is still there; my code is below:
def transform_pandas(data):
    import pandas as pd
    import json
    import datetime as dt

    df = pd.DataFrame([data])
    # Fill all columns with null if there is no data
    columns = ['peakid', 'route1', 'bcdate', 'pkname', 'heightm']
    df = df.reindex(columns, fill_value='null', axis=1)
    df['bcdate'] = pd.to_datetime(df['bcdate'], errors='coerce').dt.strftime('%Y-%m-%d')
    return json.loads(df.to_json(orient='records'))
and the code that writes to BigQuery:
output = (
    (input_p1, input_p2)
    | 'Join' >> beam.CoGroupByKey()
    | 'Final Dict' >> beam.Map(lambda el: to_final_dict(el[1]))  # it gets a result here
    | 'Transformation' >> beam.Map(transform_pandas)  # the error happens here
    | beam.Map(print)
    | 'Write To BigQuery' >> beam.io.gcp.bigquery.WriteToBigQuery(
        table='project:dataset.expeditions',
        # schema='peakid:STRING,route1:STRING,bcdate:DATETIME,pkname:STRING,heightm:INTEGER',
        method='FILE_LOADS',
        custom_gcs_temp_location='gs://bucket/folder/temp',
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
)
Here is my result:
[{'peakid': 'ACHN', 'route1': '', 'bcdate': None, 'pkname': 'Aichyn', 'heightm': '6055'}]
[{'peakid': 'AGLE', 'route1': 'null', 'bcdate': None, 'pkname': 'Agole East', 'heightm': '6675'}]
[{'peakid': 'KCHS', 'route1': 'NW Ridge', 'bcdate': '2019-10-24', 'pkname': 'Kangchung Shar', 'heightm': '6063'}]
[{'peakid': 'LNAK', 'route1': 'SSE Ridge', 'bcdate': '2015-09-17', 'pkname': 'Lhonak', 'heightm': '6070'}]
[{'peakid': 'SPH1', 'route1': 'S Face', 'bcdate': '2017-04-14', 'pkname': 'Sharphu I', 'heightm': '6433'}]
...
But I get this error when trying to sink to BigQuery:
BigQuery job beam_bq_job_LOAD_AUTOMATIC_JOB_NAME_LOAD_STEP_329_215864ba592a2e01f0c4e2157cc60c47_3a904aab56c3444bb56bda650a7404b3 failed. Error Result: <ErrorProto
location: 'gs://bucket/folder/temp/bq_load/1d12aed0bdcc463aa5350cf2cca2ef2e/project.dataset.expeditions/d67ef933-d15a-45b0-8df5-e2b5a217849d'
message: 'Error while reading data, error message: JSON table encountered too many errors, giving up. Rows: 1; errors: 1. Please look into the errors[] collection for more details. File: gs://bucket/folder/temp/bq_load/1d12aed0bdcc463aa5350cf2cca2ef2e/project.dataset.expeditions/d67ef933-d15a-45b0-8df5-e2b5a217849d'
reason: 'invalid'> [while running 'Write To BigQuery/BigQueryBatchFileLoads/WaitForDestinationLoadJobs']
Your bcdate has the date format YYYY-MM-DD in the result dict, but in your BigQuery schema and table it is a DATETIME type.
You can check the docs for the correct DATETIME format: https://cloud.google.com/bigquery/docs/reference/standard-sql/datetime_functions?hl=en
For example:
2017-05-26T00:00:00
You can also find the info in the Beam documentation:
https://beam.apache.org/documentation/io/built-in/google-bigquery/
Example:
bigquery_data = [{
    'string': 'abc',
    'bytes': base64.b64encode(b'\xab\xac'),
    'integer': 5,
    'float': 0.5,
    'numeric': Decimal('5'),
    'boolean': True,
    'timestamp': '2018-12-31 12:44:31.744957 UTC',
    'date': '2018-12-31',
    'time': '12:44:31',
    'datetime': '2018-12-31T12:44:31',
    'geography': 'POINT(30 10)'
}]
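Applied to the transform_pandas function above, a minimal sketch of how it could be adjusted (assuming bcdate is a DATETIME column, and noting that WriteToBigQuery expects one dict per element rather than a single-element list):

def transform_pandas(data):
    import pandas as pd
    import json

    df = pd.DataFrame([data])
    columns = ['peakid', 'route1', 'bcdate', 'pkname', 'heightm']
    # Missing columns become NaN, which serializes to JSON null rather than the string 'null'
    df = df.reindex(columns, axis=1)
    # BigQuery DATETIME expects 'YYYY-MM-DDTHH:MM:SS'; unparseable dates become NaT and end up as null
    df['bcdate'] = pd.to_datetime(df['bcdate'], errors='coerce').dt.strftime('%Y-%m-%dT%H:%M:%S')
    # Return the single row dict so each PCollection element is a dict, not a list
    return json.loads(df.to_json(orient='records'))[0]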
There is a lot of literature on the topic, but none of what I could find uses a GeoPandas reader.
The purpose of my code is to identify whether a point is located inside a polygon described in a .shp file stored in S3. It is then expected to return a boolean True or False.
I use the python-lambda-local Python module to test my Python script in PyCharm.
import geopandas as gpd
from geopandas.geoseries import *
import boto3
from io import BytesIO

def search(event, context):
    dep = event['Dep']
    arr = event['Arr']
    point_1 = GeoSeries(dep)
    point_2 = GeoSeries(arr)

    s3 = boto3.client("s3")
    bucket = "mybucket"
    obj_key = "filename.shp"

    # bytes_buffer = BytesIO()
    # client.download_fileobj(Bucket=bucket, Key=obj_key, Fileobj=bytes_buffer)
    obj = s3.download_file(Bucket=bucket, Key="filename.shp", Filename=obj_key)
    geo = obj['body'].read().decode('ISO-8859-9')
    # geo = bytes_buffer.get_key(obj_key).get_contents_as_string()

    answer = gpd.read_file(geo)
    print(answer)
As you can see in the code, I tried a few different ways to use IO and the reader(), always unsuccessfully though.
And this is the error message:
MacBook-Pro:IdPolygons me$ python-lambda-local -l lib/ -f search -t 4 IdAircraft.py event.json
This is the point I'm trying to identify as inside or outside the polygon:
[root - INFO - 2019-12-24 07:33:54,388] Event: {'Dep': '(40.7128, 74.0060)', 'Arr': '(48.8566, 2.3522)'}
[root - INFO - 2019-12-24 07:33:54,388] START RequestId: 629c76c7-1008-40cc-8c09-6c5dd3877125 Version:
[botocore.credentials - INFO - 2019-12-24 07:33:54,923] Found credentials in shared credentials file: ~/.aws/credentials stored
[root - INFO - 2019-12-24 07:33:55,576] END RequestId: 629c76c7-1008-40cc-8c09-6c5dd3877125
[root - INFO - 2019-12-24 07:33:55,577] REPORT RequestId: 629c76c7-1008-40cc-8c09-6c5dd3877125 Duration: 663.91 ms
[root - INFO - 2019-12-24 07:33:55,577] RESULT:
{
"errorMessage": "'NoneType' object has no attribute 'startswith'",
"stackTrace": [
" File \"/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/lambda_local/main.py\", line 153, in execute\n result = func(event, context._activate())\n",
" File \"IdAircraft.py\", line 30, in search\n df1 = gpd.read_file(obj)\n",
" File \"/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/geopandas/io/file.py\", line 77, in read_file\n with reader(path_or_bytes, **kwargs) as features:\n",
" File \"/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/fiona/env.py\", line 397, in wrapper\n return f(*args, **kwargs)\n",
" File \"/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/fiona/__init__.py\", line 249, in open\n path = parse_path(fp)\n",
" File \"/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/fiona/path.py\", line 132, in parse_path\n elif path.startswith('/vsi'):\n"
],
"errorType": "AttributeError"
}
Thank you for taking the time.
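For reference, one possible workaround is to download the shapefile and its sidecar files to local disk and read them from there; a minimal sketch, assuming the bucket also contains the .shx and .dbf components under the same base name (the key names here are placeholders):

import boto3
import geopandas as gpd
from ast import literal_eval
from shapely.geometry import Point

def search(event, context):
    s3 = boto3.client("s3")
    bucket = "mybucket"
    # A .shp file is only readable together with its sidecar files
    for ext in (".shp", ".shx", ".dbf"):
        s3.download_file(Bucket=bucket, Key="filename" + ext, Filename="/tmp/filename" + ext)

    polygons = gpd.read_file("/tmp/filename.shp")
    # The event values arrive as strings like '(40.7128, 74.0060)';
    # note that shapely Points are (x, y), i.e. (lon, lat)
    dep = Point(literal_eval(event['Dep']))
    return bool(polygons.contains(dep).any())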
I want to convert an R data type to a Python data type; below is the whole code.
def convert_datafiles(datasets_folder):
    import os
    import pandas as pd
    import rpy2.robjects as robjects
    from rpy2.robjects import numpy2ri, pandas2ri

    numpy2ri.activate()
    pandas2ri.activate()

    for root, dirs, files in os.walk(datasets_folder):
        for name in files:
            # sort out .RData files
            if name.endswith('.RData'):
                name_ = os.path.splitext(name)[0]
                name_path = os.path.join(datasets_folder, name_)
                # create sub-directory
                if not os.path.exists(name_path):
                    os.makedirs(name_path)
                file_path = os.path.join(root, name)
                robj = robjects.r.load(file_path)
                # check out subfiles in the data frame
                for var in robj:
                    ###### error happens right here
                    myRData = pandas2ri.ri2py_dataframe(var)
                    ###### error happens right here
                    # convert to DataFrame
                    if not isinstance(myRData, pd.DataFrame):
                        myRData = pd.DataFrame(myRData)
                    var_path = os.path.join(datasets_folder, name_, var + '.csv')
                    myRData.to_csv(var_path)
                os.remove(os.path.join(datasets_folder, name))  # clean up
    print("=> Success!")
I want to convert the R data type to a Python type, but this error keeps popping up: AttributeError: 'str' object has no attribute 'dtype'
How can I resolve this error?
The rpy2 documentation is somewhat incomplete when it comes to interaction with pandas, but unit tests will provide examples of conversion. For example:
rdataf = robjects.r('data.frame(a=1:2, '
                    '           b=I(c("a", "b")), '
                    '           c=c("a", "b"))')

with localconverter(default_converter + rpyp.converter) as cv:
    pandas_df = robjects.conversion.ri2py(rdataf)
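Applied to the loop in the question, the important detail is that robjects.r.load returns the names of the loaded objects, so each name has to be looked up in R's global environment before conversion. A minimal sketch against the rpy2 2.x-style API used above (treat the exact converter/function names as an assumption to check against your installed version):

import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri, default_converter
from rpy2.robjects.conversion import localconverter

loaded_names = robjects.r.load(file_path)    # character vector of object names
for var in loaded_names:
    r_obj = robjects.globalenv[var]          # fetch the actual R object, not the string
    with localconverter(default_converter + pandas2ri.converter) as cv:
        myRData = robjects.conversion.ri2py(r_obj)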
I'm currently trying to run a Dataflow (Apache Beam, Python SDK) task to import a >100GB Tweet file into BigQuery, but I'm running into Error: Message: Too many sources provided: 15285. Limit is 10000.
The task takes the tweets (JSON), extracts 5 relevant fields, transforms/sanitizes them a bit with some transforms, and then writes those values into BigQuery, which will be used for further processing.
There's Cloud Dataflow to BigQuery - too many sources, but that one seems to be caused by having a lot of different input files, whereas I have a single input file, so it doesn't seem relevant. Also, the solutions mentioned there are rather cryptic, and I'm not sure if/how I could apply them to my problem.
My guess is that BigQuery writes temporary files for each row or something before persisting them, and that's what's meant by "too many sources"?
How can I fix this?
[Edit]
Code:
import argparse
import json
import logging

import apache_beam as beam

class JsonCoder(object):
    """A JSON coder interpreting each line as a JSON string."""
    def encode(self, x):
        return json.dumps(x)

    def decode(self, x):
        return json.loads(x)

def filter_by_nonempty_county(record):
    if 'county_fips' in record and record['county_fips'] is not None:
        yield record
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        default='...',
                        help=('Input twitter json file specified as: '
                              'gs://path/to/tweets.json'))
    parser.add_argument(
        '--output',
        required=True,
        help=('Output BigQuery table for results specified as: PROJECT:DATASET.TABLE '
              'or DATASET.TABLE.'))
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = beam.Pipeline(argv=pipeline_args)

    # read text file
    # Read all tweets from given source file
    read_tweets = "Read Tweet File" >> beam.io.ReadFromText(known_args.input, coder=JsonCoder())

    # Extract the relevant fields of the source file
    extract_fields = "Project relevant fields" >> beam.Map(
        lambda row: {'text': row['text'],
                     'user_id': row['user']['id'],
                     'location': row['user']['location'] if 'location' in row['user'] else None,
                     'geo': row['geo'] if 'geo' in row else None,
                     'tweet_id': row['id'],
                     'time': row['created_at']})

    # check what type of geo-location the user has
    has_geo_location_or_not = "partition by has geo or not" >> beam.Partition(
        lambda element, partitions: 0 if element['geo'] is None else 1, 2)

    check_county_not_empty = lambda element, partitions: 1 if 'county_fips' in element and element['county_fips'] is not None else 0

    # tweet has coordinates partition or not
    coordinate_partition = (p
                            | read_tweets
                            | extract_fields
                            | beam.ParDo(TimeConversion())
                            | has_geo_location_or_not)

    # lookup by coordinates
    geo_lookup = (coordinate_partition[1]
                  | "geo coordinates mapping" >> beam.ParDo(BeamGeoLocator())
                  | "filter successful geo coords" >> beam.Partition(check_county_not_empty, 2))

    # lookup by profile
    profile_lookup = ((coordinate_partition[0], geo_lookup[0])
                      | "join streams" >> beam.Flatten()
                      | "Lookup from profile location" >> beam.ParDo(ComputeLocationFromProfile()))

    bigquery_output = "write output to BigQuery" >> beam.io.Write(
        beam.io.BigQuerySink(
            known_args.output,
            schema='text:STRING, user_id:INTEGER, county_fips:STRING, tweet_id:INTEGER, time:TIMESTAMP, county_source:STRING',
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

    # file_output = "write output" >> beam.io.WriteToText(known_args.output, coder=JsonCoder())

    output = ((profile_lookup, geo_lookup[1])
              | "merge streams" >> beam.Flatten()
              | "Filter entries without location" >> beam.FlatMap(filter_by_nonempty_county)
              | "project relevant fields" >> beam.Map(
                  lambda row: {'text': row['text'],
                               'user_id': row['user_id'],
                               'county_fips': row['county_fips'],
                               'tweet_id': row['tweet_id'],
                               'time': row['time'],
                               'county_source': row['county_source']})
              | bigquery_output)

    result = p.run()
    result.wait_until_finish()

if __name__ == '__main__':
    logging.getLogger().setLevel(logging.DEBUG)
    run()
It's a little bit complicated, so it would probably take too much time to do it in BigQuery directly. The code reads the tweets JSON, splits the PCollection by whether each tweet is geotagged or not; if not, it tries to look the location up via the profile location, maps the location to what's relevant for our GIS analysis, and then writes it to BigQuery.
The number of files corresponds to the number of shards the elements were processed in.
One trick to reduce this is to generate some random keys and group the elements based on them before writing them out.
For example, you could use the following DoFn and PTransform in your pipeline:
import random

class _RoundRobinKeyFn(beam.DoFn):
    def __init__(self, count):
        self.count = count

    def start_bundle(self):
        self.counter = random.randint(0, self.count - 1)

    def process(self, element):
        self.counter += 1
        if self.counter >= self.count:
            self.counter -= self.count
        yield self.counter, element

class LimitBundles(beam.PTransform):
    def __init__(self, count):
        self.count = count

    def expand(self, input):
        return (input
                | beam.ParDo(_RoundRobinKeyFn(self.count))
                | beam.GroupByKey()
                | beam.FlatMap(lambda kv: kv[1]))
You would just use this before the bigquery_output:
output = (# ...
          | LimitBundles(10000)
          | bigquery_output)
(Note that I just typed this in without testing it, so there are likely some Python typos.)
I have a script that extracts data from some CSV files and bifurcates the data into different Excel files. I am using IPython for that, and I'm sure it uses CPython as the default interpreter.
But the script is taking too much time for the whole process to finish. Can someone please help me with how to run this script using PyPy, as I heard it is much faster than CPython?
The script is something like this:
import pandas as pd
import xlsxwriter as xw
import csv
import pymsgbox as py

file1 = "vDashOpExel_Change_20150109.csv"
file2 = "vDashOpExel_T3Opened_20150109.csv"
# raw string so the backslashes aren't treated as escape sequences
path = r"C:\Users\Abhishek\Desktop\Pandas Anlaysis"

def uniq(words):
    seen = set()
    for word in words:
        l = word.lower()
        if l in seen:
            continue
        seen.add(l)
        yield word

def files(file_name):
    df = pd.read_csv(path + '\\' + file_name, sep=',', encoding='utf-16')
    final_frame = df.dropna(how='all')
    file_list = list(uniq(list(final_frame['DOEClient'])))
    return file_list, final_frame

def fill_data(f_list, frame1=None, frame2=None):
    if f_list is not None:
        for client in f_list:
            writer = pd.ExcelWriter(path + '\\' + 'Accounts' + '\\' + client + '.xlsx', engine='xlsxwriter')
            if frame1 is not None:
                data1 = frame1[frame1.DOEClient == client]  # Filter the Data
                data1.to_excel(writer, 'Change', index=False, header=True)  # Importing the Data to Excel File
            if frame2 is not None:
                data2 = frame2[frame2.DOEClient == client]  # Filter the Data
                data2.to_excel(writer, 'Opened', index=False, header=True)  # Importing the Data to Excel File
    else:
        py.alert('Please enter the First Parameter !!!', 'Error')

list1, frame1 = files(file1)
list2, frame2 = files(file2)
final_list = set(list1 + list2)
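As a quick aside, one way to confirm which interpreter the script is actually running under (a small standalone check, not part of the script above):

import platform
import sys

# Prints 'CPython' under the standard interpreter and 'PyPy' when run with pypy
print(platform.python_implementation(), sys.version)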