Error: Message: Too many sources provided: 15285. Limit is 10000 - google-bigquery

I'm currently trying to run a Dataflow (Apache Beam, Python SDK) task to import a >100GB Tweet file into BigQuery, but I'm running into Error: Message: Too many sources provided: 15285. Limit is 10000.
The task takes the tweets (JSON), extracts 5 relevant fields, transforms/sanitizes them a bit with some transforms and then writes those values into BigQuery, which will be used for further processing.
There's Cloud Dataflow to BigQuery - too many sources, but there the problem seems to be caused by having a lot of different input files, whereas I have a single input file, so it doesn't seem relevant. Also, the solutions mentioned there are rather cryptic and I'm not sure if/how I could apply them to my problem.
My guess is that BigQuery writes temporary files for each row or something before persisting them, and that's what's meant by "too many sources"?
How can I fix this?
[Edit]
Code:
import argparse
import json
import logging

import apache_beam as beam


class JsonCoder(object):
    """A JSON coder interpreting each line as a JSON string."""

    def encode(self, x):
        return json.dumps(x)

    def decode(self, x):
        return json.loads(x)


def filter_by_nonempty_county(record):
    if 'county_fips' in record and record['county_fips'] is not None:
        yield record


def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        default='...',
                        help=('Input twitter json file specified as: '
                              'gs://path/to/tweets.json'))
    parser.add_argument(
        '--output',
        required=True,
        help=('Output BigQuery table for results specified as: PROJECT:DATASET.TABLE '
              'or DATASET.TABLE.'))
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = beam.Pipeline(argv=pipeline_args)

    # Read all tweets from the given source file
    read_tweets = "Read Tweet File" >> beam.io.ReadFromText(known_args.input, coder=JsonCoder())

    # Extract the relevant fields of the source file
    extract_fields = "Project relevant fields" >> beam.Map(
        lambda row: {'text': row['text'],
                     'user_id': row['user']['id'],
                     'location': row['user']['location'] if 'location' in row['user'] else None,
                     'geo': row['geo'] if 'geo' in row else None,
                     'tweet_id': row['id'],
                     'time': row['created_at']})

    # Check what type of geo-location the user has
    has_geo_location_or_not = "partition by has geo or not" >> beam.Partition(
        lambda element, partitions: 0 if element['geo'] is None else 1, 2)

    check_county_not_empty = lambda element, partitions: 1 if 'county_fips' in element and element['county_fips'] is not None else 0

    # Tweet has a coordinates partition or not
    coordinate_partition = (p
                            | read_tweets
                            | extract_fields
                            | beam.ParDo(TimeConversion())
                            | has_geo_location_or_not)

    # Lookup by coordinates
    geo_lookup = (coordinate_partition[1]
                  | "geo coordinates mapping" >> beam.ParDo(BeamGeoLocator())
                  | "filter successful geo coords" >> beam.Partition(check_county_not_empty, 2))

    # Lookup by profile
    profile_lookup = ((coordinate_partition[0], geo_lookup[0])
                      | "join streams" >> beam.Flatten()
                      | "Lookup from profile location" >> beam.ParDo(ComputeLocationFromProfile()))

    bigquery_output = "write output to BigQuery" >> beam.io.Write(
        beam.io.BigQuerySink(
            known_args.output,
            schema='text:STRING, user_id:INTEGER, county_fips:STRING, tweet_id:INTEGER, time:TIMESTAMP, county_source:STRING',
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

    # file_output = "write output" >> beam.io.WriteToText(known_args.output, coder=JsonCoder())

    output = ((profile_lookup, geo_lookup[1])
              | "merge streams" >> beam.Flatten()
              | "Filter entries without location" >> beam.FlatMap(filter_by_nonempty_county)
              | "project relevant fields" >> beam.Map(
                  lambda row: {'text': row['text'],
                               'user_id': row['user_id'],
                               'county_fips': row['county_fips'],
                               'tweet_id': row['tweet_id'],
                               'time': row['time'],
                               'county_source': row['county_source']})
              | bigquery_output)

    result = p.run()
    result.wait_until_finish()


if __name__ == '__main__':
    logging.getLogger().setLevel(logging.DEBUG)
    run()
It's a little bit complicated, so it would probably take too much time to do it in BigQuery directly. The code reads the tweets JSON, splits the PCollection by whether a tweet is geotagged or not, tries to look the location up via the profile location if it isn't, maps the location to what's relevant for our GIS analysis, and then writes it to BigQuery.

The number of files corresponds to the number of shards the elements were processed in.
One trick to reduce this is to generate some random keys and group the elements by those keys before writing them out.
For example, you could use the following DoFn and PTransform in your pipeline:
import random

import apache_beam as beam


class _RoundRobinKeyFn(beam.DoFn):
    def __init__(self, count):
        self.count = count

    def start_bundle(self):
        self.counter = random.randint(0, self.count - 1)

    def process(self, element):
        self.counter += 1
        if self.counter >= self.count:
            self.counter -= self.count
        yield self.counter, element


class LimitBundles(beam.PTransform):
    def __init__(self, count):
        self.count = count

    def expand(self, input):
        return (input
                | beam.ParDo(_RoundRobinKeyFn(self.count))
                | beam.GroupByKey()
                | beam.FlatMap(lambda kv: kv[1]))
You would just use this before the bigquery_output:
output = (# ...
          | LimitBundles(10000)
          | bigquery_output)
(Note that I just typed this in without testing it, so there are likely some Python typos.)

Related

What is the most efficient way of creating a tf.dataset from multiple json.gz files with multiple text records?

I have thousands of json.gz files, each with a variety of information about scientific papers. For each file, I have to extract the relevant information - e.g. title and labels - to make a dataset, then transform it to a tf.dataset. However, this is quite inefficient, since I cannot filter by subject directly or shuffle the records in a single step.
I would like to read them using tf.dataset.interleave in order to shuffle them, but also to filter them according to specific labels.
Here is how I'm doing it up to now.
import datetime
import gzip
import json

import pandas as pd
import tensorflow as tf


# For relevant feature extraction
def load_file(file):
    # with gzip.open(bytes.decode(file), 'r') as fin:  # 4. gzip
    with gzip.open(file, 'r') as fin:
        json_bytes = fin.read()
    json_str = json_bytes.decode('utf-8')  # 2. string (i.e. JSON)
    bb = json.loads(json_str)
    bb = pd.json_normalize(bb, 'items', ['indexed', ['title', 'publisher', 'type', 'indexed.date-parts', 'subject']],
                           errors='ignore')
    bb.dropna(subset=['title', 'publisher', 'type', 'indexed.date-parts', 'subject'], inplace=True)
    # `themes` is a dict mapping subject names to integer labels, defined elsewhere in the notebook
    bb.subject = bb.subject.apply(
        lambda x: int(themes[list(set(x) & set(list(themes.keys())))[0]])
        if len(list(set(x) & set(list(themes.keys())))) > 0
        else len(list(themes.keys())) + 1)
    bb.title = bb.title.str.join('').values
    # bb['author'] = bb['author'].apply(lambda x: '; '.join([', '.join([i['given'], i['family']]) for i in x]))
    bb['indexed.date-parts'] = bb['indexed.date-parts'].apply(
        lambda tpl: datetime.datetime.strptime('-'.join(str(x) for x in tpl[0]), '%Y-%m-%d').strftime('%Y-%m-%d'))
    # bb = bb.sample(n=32, replace=True)
    # return bb.title.str.join('').values, bb.subject.str.join(', ').values
    return dict(bb[['title', 'publisher', 'type', 'indexed.date-parts', 'subject']])


file_list = ['file_2021_01/10625.json.gz',
             'file_2021_01/23897.json.gz',
             'file_2021_01/12169.json.gz',
             'file_2021_01/427.json.gz', ...]

filenames = tf.data.Dataset.list_files(file_list, shuffle=True)
dataset = filenames.apply(
    tf.data.experimental.parallel_interleave(
        lambda x: tf.data.Dataset.from_tensor_slices(tf.numpy_function(load_file, [x], (tf.int64))),
        cycle_length=1))
However, it results in an error:
InternalError: Unsupported object type dict
[[{{node PyFunc}}]] [Op:IteratorGetNext]
Thanks
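As an aside (this is my own sketch, not from the thread): the error comes from tf.numpy_function, which must return arrays matching the dtypes declared in its Tout argument rather than a Python dict. One untested way around it, assuming only the title strings and the integer subject labels are needed, would be to have the wrapped function return aligned arrays and to switch to Dataset.interleave:
import numpy as np
import tensorflow as tf

def load_file_arrays(path):
    # Hypothetical wrapper around load_file from above: return aligned arrays instead of a dict.
    d = load_file(path)
    titles = np.array([t.encode('utf-8') for t in d['title']])
    subjects = np.array(d['subject'], dtype=np.int64)
    return titles, subjects

def make_dataset(path):
    titles, subjects = tf.numpy_function(
        load_file_arrays, [path], (tf.string, tf.int64))
    return tf.data.Dataset.from_tensor_slices((titles, subjects))

dataset = filenames.interleave(make_dataset, cycle_length=4)
# Filtering by label can then be done on the label tensor, e.g.:
# dataset = dataset.filter(lambda title, subject: subject != some_excluded_label)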

Can not sink to BigQuery using Dataflow Apache Beam

I have 2 CSV files, expeditions- 2010s.csv and peaks.csv, with the join key 'peak_id'. I'm using a notebook with Apache Beam on Dataflow to join them. Here is my code:
import argparse
import logging

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions


def read_csv_file(readable_file):
    import apache_beam as beam
    import csv
    import io
    import datetime

    # Open a channel to read the file from GCS
    gcs_file = beam.io.filesystems.FileSystems.open(readable_file)

    # Read it as csv, you can also use csv.reader
    csv_dict = csv.DictReader(io.TextIOWrapper(gcs_file))
    for row in csv_dict:
        yield (row)


def run(argv=None):
    import apache_beam as beam
    import io

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        required=False,
        help='Input file to read. This can be a local file or '
             'a file in a Google Storage Bucket.',
        # This example file contains a total of only 10 lines.
        # Useful for developing on a small set of data.
        default='gs://bucket/folder/peaks.csv')
    parser.add_argument(
        '--input1',
        dest='input1',
        required=False,
        help='Input file to read. This can be a local file or '
             'a file in a Google Storage Bucket.',
        # This example file contains a total of only 10 lines.
        # Useful for developing on a small set of data.
        default='gs://bucket/folder/expeditions- 2010s.csv')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    p = beam.Pipeline(options=PipelineOptions(pipeline_args))

    input_p1 = (
        p
        | 'Read From GCS input1' >> beam.Create([known_args.input1])
        | 'Parse csv file p1' >> beam.FlatMap(read_csv_file)
        | 'Tuple p1' >> beam.Map(lambda e: (e["peakid"], {'peakid': e["peakid"], 'bcdate': e["bcdate"], 'smtdate': e["smtdate"]}))
    )
    input_p2 = (
        p
        | 'Read From GCS input2' >> beam.Create([known_args.input])
        | 'Parse csv file p2' >> beam.FlatMap(read_csv_file)
        | 'Tuple p2' >> beam.Map(lambda e: (e["peakid"], {'peakid': e["peakid"], 'pkname': e["pkname"], 'heightm': e["heightm"]}))
    )

    # CoGroupByKey: relational join of 2 or more key/value PCollections. It also accepts a dictionary of key/value pairs.
    output = (
        (input_p1, input_p2)
        | 'Join' >> beam.CoGroupByKey()
        | 'Final Dict' >> beam.Map(lambda el: to_final_dict(el[1]))
        # | beam.Map(print)
        | 'Write To BigQuery' >> beam.io.gcp.bigquery.WriteToBigQuery(
            table='project:dataset.expeditions',
            method='FILE_LOADS',
            custom_gcs_temp_location='gs://bucket/folder/temp',
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
    )
    p.run().wait_until_finish()


def to_final_dict(list_tuple_of_tuple):
    result = {}
    for list_tuple in list_tuple_of_tuple:
        for el in list_tuple:
            result.update(el)
    return result


# runner = DataflowRunner()
# runner.run_pipeline(p, options=options)

if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    run()
I got the result before writing to BigQuery:
{'peakid': 'TKRG', 'bcdate': '4/24/10', 'smtdate': '5/5/10', 'pkname': 'Takargo', 'heightm': '6771'}
{'peakid': 'AMPG', 'bcdate': '4/5/10', 'smtdate': '', 'pkname': 'Amphu Gyabjen', 'heightm': '5630'}
{'peakid': 'AMAD', 'bcdate': '1/27/20', 'smtdate': '2/2/20', 'pkname': 'Ama Dablam', 'heightm': '6814'}
{'peakid': 'ANN1', 'bcdate': '3/27/19', 'smtdate': '4/23/19', 'pkname': 'Annapurna I', 'heightm': '8091'}
...
But it can't write to BigQuery with the error:
RuntimeError: BigQuery job beam_bq_job_LOAD_AUTOMATIC_JOB_NAME_LOAD_STEP_602_215864ba592a2e01f0c4e2157cc60c47_51de5de53b58409da70f699c833c4db5 failed. Error Result: <ErrorProto
location: 'gs://bucket/folder/temp/bq_load/4bbfc44d750c4af5ab376b2e3c3dedbd/project.dataset.expeditions/25905e46-db76-49f0-9b98-7d77131e3e0d'
message: 'Error while reading data, error message: JSON table encountered too many errors, giving up. Rows: 3; errors: 1. Please look into the errors[] collection for more details. File: gs://bucket/folder/temp/bq_load/4bbfc44d750c4af5ab376b2e3c3dedbd/project.dataset.expeditions/25905e46-db76-49f0-9b98-7d77131e3e0d'
reason: 'invalid'> [while running 'Write To BigQuery/BigQueryBatchFileLoads/WaitForDestinationLoadJobs']
I think the date format is not correct. Use the format YYYY-MM-DD (e.g. 2013-12-25) for your date fields, and that should solve your issue.
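For example, a minimal sketch (my own illustration, untested) of a step that could be slotted in just before 'Write To BigQuery', assuming bcdate and smtdate arrive as M/D/YY strings like in the sample rows above and that empty strings should become NULL:
from datetime import datetime

def fix_dates(row):
    # Hypothetical helper, not part of the original pipeline:
    # convert bcdate/smtdate from M/D/YY (e.g. 4/24/10) to YYYY-MM-DD; map empty strings to None.
    for field in ('bcdate', 'smtdate'):
        value = row.get(field)
        row[field] = (datetime.strptime(value, '%m/%d/%y').strftime('%Y-%m-%d')
                      if value else None)
    return row

# ...
# | 'Final Dict' >> beam.Map(lambda el: to_final_dict(el[1]))
# | 'Fix date format' >> beam.Map(fix_dates)
# | 'Write To BigQuery' >> ...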

combine two lists to PCollection

I'm using Apache Beam. When writing to TFRecord I need to include the ID of each item along with its text and embedding.
The tutorial works with just one list of text, but I also have a list of IDs matching the list of text, so I was wondering how I could pass the IDs to the following function:
def to_tf_example(entries):
    examples = []

    text_list, embedding_list = entries
    for i in range(len(text_list)):
        text = text_list[i]
        embedding = embedding_list[i]

        features = {
            # need to pass in ID here like so:
            'id': tf.train.Feature(
                bytes_list=tf.train.BytesList(value=[ids.encode('utf-8')])),
            'text': tf.train.Feature(
                bytes_list=tf.train.BytesList(value=[text.encode('utf-8')])),
            'embedding': tf.train.Feature(
                float_list=tf.train.FloatList(value=embedding.tolist()))
        }

        example = tf.train.Example(
            features=tf.train.Features(
                feature=features)).SerializeToString(deterministic=True)

        examples.append(example)

    return examples
My first thought was just to include the IDs in the text column of my database and then extract them via slicing or regex or something, but I was wondering if there was a better way. I assume it involves converting to a PCollection, but I don't know where to start. Here is the pipeline:
with beam.Pipeline(args.runner, options=options) as pipeline:
    query_data = pipeline | 'Read data from BigQuery' >> beam.io.Read(
        beam.io.BigQuerySource(project='my-project', query=get_data(args.limit), use_standard_sql=True))

    # list of texts
    text = query_data | 'get list of text' >> beam.Map(lambda x: x['text'])
    # list of ids
    ids = query_data | 'get list of ids' >> beam.Map(lambda x: x['id'])

    (text
     | 'Batch elements' >> util.BatchElements(
         min_batch_size=args.batch_size, max_batch_size=args.batch_size)
     | 'Generate embeddings' >> beam.Map(
         generate_embeddings, args.module_url, args.random_projection_matrix)
     | 'Encode to tf example' >> beam.FlatMap(to_tf_example)
     | 'Write to TFRecords files' >> beam.io.WriteToTFRecord(
         file_path_prefix='{0}'.format(args.output_dir),
         file_name_suffix='.tfrecords')
    )

    query_data | 'Convert to entity and write to datastore' >> beam.Map(
        lambda input_features: create_entity(input_features, args.kind))
I altered generate_embeddings to return List[int], List[string], List[List[float]] and then used the following function to pass the list of ids and text in:
def generate_embeddings_for_batch(batch, module_url, random_projection_matrix):
    embeddings = generate_embeddings(
        [x['id'] for x in batch], [x['text'] for x in batch], module_url, random_projection_matrix)
    return embeddings
Here I'll assume generate_embeddings has the signature List[str], ... -> (List[str], List[List[float]])
What you want to do is avoid splitting your texts and ids into separate PCollections. So you might want to write something like
from typing import List, Tuple


def generate_embeddings_for_batch(
        batch,
        module_url,
        random_projection_matrix) -> Tuple[int, str, List[float]]:
    embeddings = generate_embeddings(
        [x['text'] for x in batch], module_url, random_projection_matrix)
    text_to_embedding = dict(zip(*embeddings))
    for x in batch:
        yield x['id'], x['text'], text_to_embedding[x['text']]
From there you should be able to write to_tf_example.
It would probably make sense to look at using TFX.
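For illustration, a rough sketch (untested, my own) of what that adapted to_tf_example could look like, assuming it now receives a single (id, text, embedding) tuple, so it would be wired in with beam.Map instead of beam.FlatMap, and that IDs are serialized as UTF-8 bytes:
import tensorflow as tf

def to_tf_example(entry):
    # Hypothetical adaptation: encode one (id, text, embedding) tuple as a serialized tf.train.Example.
    entry_id, text, embedding = entry
    features = {
        'id': tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[str(entry_id).encode('utf-8')])),
        'text': tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[text.encode('utf-8')])),
        'embedding': tf.train.Feature(
            float_list=tf.train.FloatList(value=list(embedding))),
    }
    return tf.train.Example(
        features=tf.train.Features(feature=features)).SerializeToString(deterministic=True)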

Pandas UDF (PySpark) - Incorrect type Error

I'm trying entity extraction with spaCy and Pandas UDF (PySpark) but I get an error.
Using a UDF works without errors but is slow. What am I doing wrong?
Loading the model on every call is a workaround for the load error: Can't find model 'en_core_web_lg'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.
Working UDF:
import spacy
from pyspark.sql import functions as F
from pyspark.sql import types as T

def __get_entities(x):
    global nlp
    nlp = spacy.load("en_core_web_lg")
    ents = []

    doc = nlp(x)
    for ent in doc.ents:
        if ent.label_ == 'PERSON' or ent.label_ == 'ORG':
            ents.append(ent.label_)

    return ents

get_entities_udf = F.udf(__get_entities, T.ArrayType(T.StringType()))
Pandas UDF with error:
def __get_entities(x):
    global nlp
    nlp = spacy.load("en_core_web_lg")
    ents = []

    doc = nlp(x)
    for ent in doc.ents:
        if ent.label_ == 'PERSON' or ent.label_ == 'ORG':
            ents.append(ent.label_)

    return pd.Series(ents)

get_entities_udf = F.pandas_udf(lambda x: __get_entities(x), "array<string>", F.PandasUDFType.SCALAR)
Error message:
TypeError: Argument 'string'has incorrect type (expected str, got series)
Sample Spark DataFrame:
df = spark.createDataFrame([
    ['John Doe'],
    ['Jane Doe'],
    ['Microsoft Corporation'],
    ['Apple Inc.'],
]).toDF("name",)
New column:
df_new = df.withColumn('entity',get_entities_udf('name'))
You need to treat the input as a pd.Series instead of a single value.
I was able to get it working by refactoring the code a bit. Notice the x.apply call, which is pandas-specific and applies the function to each element of the pd.Series.
from pyspark.sql.functions import pandas_udf, PandasUDFType


def entities(x):
    global nlp
    import spacy
    nlp = spacy.load("en_core_web_lg")
    ents = []

    doc = nlp(x)
    for ent in doc.ents:
        if ent.label_ == 'PERSON' or ent.label_ == 'ORG':
            ents.append(ent.label_)

    return ents


def __get_entities(x):
    return x.apply(entities)


get_entities_udf = pandas_udf(lambda x: __get_entities(x), "array<string>", PandasUDFType.SCALAR)

df_new = df.withColumn('entity', get_entities_udf('name'))
df_new.show()
+--------------------+--------+
| name| entity|
+--------------------+--------+
| John Doe|[PERSON]|
| Jane Doe|[PERSON]|
|Microsoft Corpora...| [ORG]|
| Apple Inc.| [ORG]|
+--------------------+--------+
I'm using: pyspark 3.1.1 and python 3.7
The answer above didn't work for me, and I spent quite some time making things work, so I thought I'd share the solution I came up with.
Setting things up
creating a sample of 16 random person and company names
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, ArrayType
from pyspark.sql.functions import pandas_udf, PandasUDFType
from faker import Faker
import spacy

spark = SparkSession.builder.appName("pyspark_sandbox").getOrCreate()

names = []
fake = Faker()
for _ in range(8):
    names.append(f"{fake.company()} {fake.company_suffix()}")
    names.append(fake.name())

df = spark.createDataFrame(names, StringType())
As it is
First, checking the currently proposed solution. I'm just adding a print statement upon loading the spaCy model to see how many times we load it.
# printing a msg each time we load the model
def load_spacy_model():
    print("Loading spacy model...")
    return spacy.load("en_core_web_sm")


def entities(x):
    global nlp
    import spacy
    nlp = load_spacy_model()
    ents = []

    doc = nlp(x)
    for ent in doc.ents:
        if ent.label_ == 'PERSON' or ent.label_ == 'ORG':
            ents.append(ent.label_)

    return ents


def __get_entities(x):
    return x.apply(entities)


get_entities_udf = pandas_udf(lambda x: __get_entities(x), "array<string>", PandasUDFType.SCALAR)

df_new = df.withColumn('entity', get_entities_udf('value'))
df_new.show()
We can then see that the model is loaded 16 times, so once for every single entry we process. Not what I want.
Batch processing
Rewriting this using the decorator introduced in Spark 3.0+ that uses type hints (Python 3.6+). Our UDF then uses nlp.pipe() to batch-process the entire pd.Series.
# printing a msg each time we load the model
def load_spacy_model():
    print("Loading spacy model...")
    return spacy.load("en_core_web_sm")


# decorator indicating that this function is a pandas_udf
# and that it's gonna process a list of strings;
# the function receives a pd.Series and returns a pd.Series
@pandas_udf(ArrayType(StringType()))
def entities(list_of_text: pd.Series) -> pd.Series:
    global nlp
    nlp = load_spacy_model()
    docs = nlp.pipe(list_of_text)

    # retrieving the str representation of entity label
    # as we are limited in the types of obj
    # we can return from a panda_udf
    # we couldn't return a Span obj for example
    ents = [
        [ent.label_ for ent in doc.ents]
        for doc in docs
    ]

    return pd.Series(ents)


df_new = df.withColumn('entity', entities('value'))
df_new.show()
In my case the model was loaded 4 times, which is better: it's loaded each time a Python worker is created to process a batch. So the number depends on how many cores Spark is using but, more critically in my case, on how partitioned our data is. So it's still not optimal.
Broadcasting the nlp object
# printing a msg each time we load the model
def load_spacy_model():
    print("Loading spacy model...")
    return spacy.load("en_core_web_sm")


@pandas_udf(ArrayType(StringType()))
def entities(list_of_text: pd.Series) -> pd.Series:
    nlp = boardcasted_nlp.value
    docs = nlp.pipe(list_of_text)

    # retrieving the str representation of entity label
    # as we are limited in the types of obj
    # we can return from a panda_udf
    # we couldn't return a Span obj for example
    ents = [
        [ent.label_ for ent in doc.ents]
        for doc in docs
    ]

    return pd.Series(ents)


boardcasted_nlp = spark.sparkContext.broadcast(load_spacy_model())

df_new = df.withColumn('entity', entities('value'))
df_new.show()
Now the model is loaded only once and then broadcast to every Python worker that gets spawned.
The complete Code
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, ArrayType
from pyspark.sql.functions import pandas_udf, PandasUDFType
from faker import Faker
import spacy

spark = SparkSession.builder.appName("pyspark_sandbox").getOrCreate()

# creating our set of fake person and company names
names = []
fake = Faker()
for _ in range(8):
    names.append(f"{fake.company()} {fake.company_suffix()}")
    names.append(fake.name())

df = spark.createDataFrame(names, StringType())


# printing a msg each time we load the model
def load_spacy_model():
    print("Loading spacy model...")
    return spacy.load("en_core_web_sm")


# decorator indicating that this function is a pandas_udf
# and that it's gonna process a list of strings;
# the function receives a pd.Series and returns a pd.Series
@pandas_udf(ArrayType(StringType()))
def entities(list_of_text: pd.Series) -> pd.Series:
    # retrieving the shared nlp object
    nlp = boardcasted_nlp.value
    # batch processing our list of text
    docs = nlp.pipe(list_of_text)

    # retrieving the str representation of entity label
    # as we are limited in the types of obj
    # we can return from a panda_udf
    # we couldn't return a Span obj for example
    ents = [
        [ent.label_ for ent in doc.ents]
        for doc in docs
    ]

    return pd.Series(ents)


# we load the spacy model and broadcast it
boardcasted_nlp = spark.sparkContext.broadcast(load_spacy_model())

df_new = df.withColumn('entity', entities('value'))
df_new.show(truncate=False)
Result
+----------------------------------+--------------------------------+
|value |entity |
+----------------------------------+--------------------------------+
|Ferguson, Price and Green Ltd |[ORG, ORG, ORG] |
|Cassandra Goodman MD |[PERSON] |
|Solis Ltd LLC |[ORG] |
|Laurie Foster |[PERSON] |
|Lane-Vasquez Group |[ORG] |
|Matthew Wright |[PERSON] |
|Scott, Pugh and Rodriguez and Sons|[PERSON, PERSON, PERSON, PERSON]|
|Tina Cooke |[PERSON] |
|Watkins, Blake and Foster Ltd |[ORG] |
|Charles Reyes |[PERSON] |
|Cooper, Norris and Roberts PLC |[ORG] |
|Michael Tate |[PERSON] |
|Powell, Lawson and Perez and Sons |[PERSON, PERSON, PERSON, PERSON]|
|James Wolf PhD |[PERSON] |
|Greer-Swanson PLC |[ORG] |
|Nicholas Hale |[PERSON] |
+----------------------------------+--------------------------------+

Unable to reload data as a csv file from IPython Notebook

I have the following IPython Notebook, in which I am trying to access a database of movies from the Rotten Tomatoes website.
But Rotten Tomatoes limits you to 10,000 API requests a day.
Since I don't want to re-run this function every time I restart the notebook, I am trying to save and reload this data as a CSV file. When I convert the data to a CSV file, I see the processing symbol [*] in the IPython Notebook, and after some time I get the following error:
ConnectionError: HTTPConnectionPool(host='api.rottentomatoes.com', port=80): Max retries exceeded with url: /api/public/v1.0/movie_alias.json?apikey=5xr26r2qtgf9h3kcq5kt6y4v&type=imdb&id=0113845 (Caused by <class 'socket.gaierror'>: [Errno 11002] getaddrinfo failed)
Is this problem due to a slow internet connection? Should I make some changes to my code? Kindly help me with this.
The code for the file is shown below:
%matplotlib inline

import json
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

api_key = '5xr26r2qtgf9h3kcq5kt6y4v'
movie_id = '770672122'  # toy story 3
url = 'http://api.rottentomatoes.com/api/public/v1.0/movies/%s/reviews.json' % movie_id

# these are "get parameters"
options = {'review_type': 'top_critic', 'page_limit': 20, 'page': 1, 'apikey': api_key}
data = requests.get(url, params=options).text
data = json.loads(data)  # load a json string into a collection of lists and dicts

print json.dumps(data['reviews'][0], indent=2)  # dump an object into a json string

from io import StringIO
movie_txt = requests.get('https://raw.github.com/cs109/cs109_data/master/movies.dat').text
movie_file = StringIO(movie_txt)  # treat a string like a file
movies = pd.read_csv(movie_file, delimiter='\t')
movies

# print the first row
movies[['id', 'title', 'imdbID', 'year']]

def base_url():
    return 'http://api.rottentomatoes.com/api/public/v1.0/'

def rt_id_by_imdb(imdb):
    """
    Queries the RT movie_alias API. Returns the RT id associated with an IMDB ID,
    or raises a KeyError if no match was found
    """
    url = base_url() + 'movie_alias.json'

    imdb = "%7.7i" % imdb
    params = dict(id=imdb, type='imdb', apikey=api_key)

    r = requests.get(url, params=params).text
    r = json.loads(r)

    return r['id']

def _imdb_review(imdb):
    """
    Query the RT reviews API, to return the first page of reviews
    for a movie specified by its IMDB ID

    Returns a list of dicts
    """
    rtid = rt_id_by_imdb(imdb)
    url = base_url() + 'movies/{0}/reviews.json'.format(rtid)

    params = dict(review_type='top_critic',
                  page_limit=20,
                  page=1,
                  country='us',
                  apikey=api_key)
    data = json.loads(requests.get(url, params=params).text)
    data = data['reviews']
    data = [dict(fresh=r['freshness'],
                 quote=r['quote'],
                 critic=r['critic'],
                 publication=r['publication'],
                 review_date=r['date'],
                 imdb=imdb, rtid=rtid
                 ) for r in data]
    return data

def fetch_reviews(movies, row):
    m = movies.irow(row)
    try:
        result = pd.DataFrame(_imdb_review(m['imdbID']))
        result['title'] = m['title']
    except KeyError:
        return None
    return result

def build_table(movies, rows):
    dfs = [fetch_reviews(movies, r) for r in range(rows)]
    dfs = [d for d in dfs if d is not None]
    return pd.concat(dfs, ignore_index=True)

critics = build_table(movies, 3000)
critics.to_csv('critics.csv', index=False)
critics = pd.read_csv('critics.csv')
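Independent of the connection error, a minimal caching sketch (my own illustration, not from the notebook) for the save/reload part: only call the API when critics.csv is missing, so a restarted kernel reloads the saved file instead of re-issuing thousands of requests.
import os

def load_or_build_critics(movies, rows=3000, path='critics.csv'):
    # Hypothetical wrapper around build_table from above: reuse the saved CSV when it exists.
    if os.path.exists(path):
        return pd.read_csv(path)
    critics = build_table(movies, rows)
    critics.to_csv(path, index=False)
    return critics

critics = load_or_build_critics(movies)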