I'm trying to upload a pandas dataframe of Twitter API data to a table in BigQuery.
Here's my dataframe prep code from Google Colab notebook:
!pip install --upgrade google-cloud-language
!pip install pandas-gbq -U
from google.colab import files
uploaded = files.upload()
for fn in uploaded.keys():
print('User uploaded file "{name}" with length {length} bytes'.format(name=fn, length=len(uploaded[fn])))
import os
# Imports Credential File:
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "pp-004a-d61bf3451d85.json"
print("Service Account Key: {}".format(os.environ["GOOGLE_APPLICATION_CREDENTIALS"]))
!pip install --upgrade tweepy
interval = "15"
start = '2022-04-07'
end = '2022-04-12'
# Tweepy
searchQ = '(max muncy) -is:retweet lang:en'
intval_tw = "{}T".format(interval)
start_tw = '{}T00:00:00Z'.format(start)
end_tw = '{}T23:59:59Z'.format(end)
# index = pd.date_range('1/1/2000', periods=9, freq='T')
# D = calendar day frequency, H = hourly frequency, T, min = minutely frequency
# Library installs
import tweepy
# from twitter_authentication import bearer_token
import time
import pandas as pd
import requests
import json
import numpy as np
bearer_token = "BEARER_TOKEN"
client = tweepy.Client(bearer_token, wait_on_rate_limit=True)
gathered_tweets = []
for response in tweepy.Paginator(client.search_recent_tweets,
query = searchQ,
user_fields = ['name', 'description', 'username', 'profile_image_url', 'url', 'pinned_tweet_id', 'verified', 'created_at', 'location', 'public_metrics', 'entities'],
tweet_fields = ['public_metrics', 'created_at','lang', 'attachments', 'context_annotations', 'conversation_id', 'entities', 'geo', 'in_reply_to_user_id', 'possibly_sensitive', 'referenced_tweets', 'reply_settings', 'source'],
media_fields = ['duration_ms', 'media_key', 'preview_image_url', 'type', 'url', 'height', 'width', 'public_metrics'],
expansions = ['author_id', 'attachments.media_keys', 'entities.mentions.username', 'geo.place_id', 'in_reply_to_user_id', '', ''],
start_time = start_tw,
end_time = end_tw,
result = []
user_dict = {}
# Loop through each response object
for response in gathered_tweets:
# Take all of the users, and put them into a dictionary of dictionaries with the info we want to keep
for user in response.includes['users']:
user_dict[] = {'username': user.username,
'created_at': user.created_at,
'location': user.location,
'verified': user.verified,
'description': user.description,
'url': user.url,
'profile_image_url': user.profile_image_url,
'pinned_tweet': user.pinned_tweet_id,
'entities': user.entities,
'followers': user.public_metrics['followers_count'],
'total_tweets': user.public_metrics['tweet_count'],
'following': user.public_metrics['following_count'],
'listed': user.public_metrics['listed_count'],
'tweets': user.public_metrics['tweet_count']
for tweet in
# For each tweet, find the author's information
author_info = user_dict[tweet.author_id]
# Put all of the information we want to keep in a single dictionary for each tweet
result.append({'author_id': tweet.author_id,
'username': author_info['username'],
'name': author_info['name'],
'author_followers': author_info['followers'],
'author_following': author_info['following'],
'author_tweets': author_info['tweets'],
'author_description': author_info['description'],
'author_url': author_info['url'],
'profile_image_url': author_info['profile_image_url'],
#'pinned_tweet': author_info['pinned_tweet_id'],
#'total_tweets': author_info['tweet_count'],
#'listed_count': author_info['listed_count'],
'entities': author_info['entities'],
'verified': author_info['verified'],
'account_created_at': author_info['created_at'],
'text': tweet.text,
'created_at': tweet.created_at,
'lang': tweet.lang,
'retweets': tweet.public_metrics['retweet_count'],
'replies': tweet.public_metrics['reply_count'],
'likes': tweet.public_metrics['like_count'],
'quotes': tweet.public_metrics['quote_count'],
'replied': tweet.in_reply_to_user_id,
'sensitive': tweet.possibly_sensitive,
'referenced_tweets': tweet.referenced_tweets,
'reply_settings': tweet.reply_settings,
'source': tweet.source
#'video_views': tweet.public_metrics['view_count']
dfTW00 = pd.DataFrame(result)
dfTW01 = dfTW00
# Create 'engagement' metric
dfTW01['engagement'] = dfTW01['retweets'] + dfTW01['replies'] + dfTW01['likes'] + dfTW01['quotes']
# Add 'tweets' column with value of 1
dfTW01['tweets'] = 1
# Engagement Rate calc
dfTW01['eng_rate'] = (dfTW01['tweets'] / dfTW01['engagement'])
# Add twitter link
dfTW01['base_url'] = ''
# base_url = ''
dfTW01['tweet_link'] = dfTW01['base_url'] + dfTW01['tweet_id'].astype(str)
# Imports the Google Cloud client library
from google.cloud import language_v1

# Instantiates a client
client = language_v1.LanguageServiceClient()


def get_sentiment(text):
    """Return the document-level sentiment of ``text``.

    ``text`` is sent as a plain-text document to the Cloud Natural Language
    API.  The returned value is the ``document_sentiment`` part of the
    response; its string form is what the later ``str.split`` step parses
    into the ``magnitude`` and ``score`` columns.
    """
    # The text to analyze
    document = language_v1.Document(
        content=text,
        type_=language_v1.Document.Type.PLAIN_TEXT,
    )
    # Detects the sentiment of the text
    sentiment = client.analyze_sentiment(
        request={"document": document}
    ).document_sentiment
    return sentiment
dfTW01["sentiment"] = dfTW01["text"].apply(get_sentiment)
dfTW02 = dfTW01['sentiment'].astype(str).str.split(expand=True)
dfTW03 = pd.merge(dfTW01, dfTW02, left_index=True, right_index=True)
dfTW03.rename(columns = {1:'magnitude', 3:'score'}, inplace=True)
cols = ['magnitude', 'score']
dfTW03[cols] = dfTW03[cols].apply(pd.to_numeric, errors='coerce', axis=1)
def return_status(x):
    """Map a numeric sentiment score to a coarse label.

    Returns 'Positive' for scores of 0.5 and above, 'Negative' for scores
    of -0.5 and below, and 'Neutral' for everything in between.  NaN
    compares false against both thresholds, so it also yields 'Neutral'.
    """
    if x >= .5:
        return 'Positive'
    elif x <= -.5:
        return 'Negative'
    return 'Neutral'
dfTW03['sentiment2'] = dfTW03['score'].apply(return_status)
What I've tried
This is what I've used for the upload (I've confirmed the project, dataset and table info are correct):
However, that method is returning this error message:
TypeError: '<' not supported between instances of 'int' and 'str'
I've found several posts on SO addressing this, but I'm unable to relate them to my situation. (I thought various datatypes could be uploaded to a BigQuery table.)
Primarily, I'm not clear what the error message means by '<' not supported between instances of 'int' and 'str'.
Any input on what that means would be greatly appreciated.
Below are the pandas dtypes in my dataframe if helpful.
Dataframe dtypes
Pandas dataframe dtypes:
author_id int64
username object
name object
author_followers int64
author_following int64
author_tweets int64
author_description object
author_url object
profile_image_url object
entities object
verified bool
account_created_at datetime64[ns, UTC]
text object
created_at datetime64[ns, UTC]
lang object
tweet_id int64
retweets int64
replies int64
likes int64
quotes int64
replied float64
sensitive bool
referenced_tweets object
reply_settings object
source object
engagement int64
tweets int64
eng_rate float64
base_url object
tweet_link object
sentiment object
0 object
magnitude float64
2 object
score float64
sentiment_rating float64
sentiment2 object
dtype: object
Instead of the to_gbq() function from pandas, you could try the load_table_from_dataframe() function from the BigQuery client library to load your dataframe into BigQuery.
Please see the below sample python code using load_table_from_dataframe():
import datetime

from google.cloud import bigquery
import pandas
import pytz

# Construct a BigQuery client object.
client = bigquery.Client()

# TODO(developer): Set table_id to the ID of the table to create,
# in the form "your-project.your_dataset.your_table_name".
table_id = ""

records = [
    {
        "title": "The Meaning of Life",
        "release_year": 1983,
        "length_minutes": 112.5,
        "release_date": pytz.timezone("Europe/Paris")
        .localize(datetime.datetime(1983, 5, 9, 13, 0, 0))
        .astimezone(pytz.utc),
        # Assume UTC timezone when a datetime object contains no timezone.
        "dvd_release": datetime.datetime(2002, 1, 22, 7, 0, 0),
    },
    {
        "title": "Monty Python and the Holy Grail",
        "release_year": 1975,
        "length_minutes": 91.5,
        "release_date": pytz.timezone("Europe/London")
        .localize(datetime.datetime(1975, 4, 9, 23, 59, 2))
        .astimezone(pytz.utc),
        "dvd_release": datetime.datetime(2002, 7, 16, 9, 0, 0),
    },
    {
        "title": "Life of Brian",
        "release_year": 1979,
        "length_minutes": 94.25,
        "release_date": pytz.timezone("America/New_York")
        .localize(datetime.datetime(1979, 8, 17, 23, 59, 5))
        .astimezone(pytz.utc),
        "dvd_release": datetime.datetime(2008, 1, 14, 8, 0, 0),
    },
    {
        "title": "And Now for Something Completely Different",
        "release_year": 1971,
        "length_minutes": 88.0,
        "release_date": pytz.timezone("Europe/London")
        .localize(datetime.datetime(1971, 9, 28, 23, 59, 7))
        .astimezone(pytz.utc),
        "dvd_release": datetime.datetime(2003, 10, 22, 10, 0, 0),
    },
]
dataframe = pandas.DataFrame(
    records,
    # In the loaded table, the column order reflects the order of the
    # columns in the DataFrame.
    columns=[
        "title",
        "release_year",
        "length_minutes",
        "release_date",
        "dvd_release",
    ],
    # Optionally, set a named index, which can also be written to the
    # BigQuery table.
    index=pandas.Index(
        ["Q24980", "Q25043", "Q24953", "Q16403"], name="wikidata_id"
    ),
)
job_config = bigquery.LoadJobConfig(
    # Specify a (partial) schema. All columns are always written to the
    # table. The schema is used to assist in data type definitions.
    schema=[
        # Specify the type of columns whose type cannot be auto-detected. For
        # example the "title" column uses pandas dtype "object", so its
        # data type is ambiguous.
        bigquery.SchemaField("title", bigquery.enums.SqlTypeNames.STRING),
        # Indexes are written if included in the schema by name.
        bigquery.SchemaField("wikidata_id", bigquery.enums.SqlTypeNames.STRING),
    ],
    # Optionally, set the write disposition. BigQuery appends loaded rows
    # to an existing table by default, but with WRITE_TRUNCATE write
    # disposition it replaces the table with the loaded data.
    write_disposition="WRITE_TRUNCATE",
)

job = client.load_table_from_dataframe(
    dataframe, table_id, job_config=job_config
)  # Make an API request.
job.result()  # Wait for the job to complete.

table = client.get_table(table_id)  # Make an API request.
print(
    "Loaded {} rows and {} columns to {}".format(
        table.num_rows, len(table.schema), table_id
    )
)
I'm trying entity extraction with spaCy and Pandas UDF (PySpark) but I get an error.
Using a UDF works without errors but is slow. What am I doing wrong?
Loading the model every time is to avoid load error - Can't find model 'en_core_web_lg'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.
Working UDF:
def __get_entities(x):
    """Return the labels of the PERSON and ORG entities spaCy finds in ``x``.

    ``x`` is a single string (one row value); the result is a list of
    label strings, matching the ``ArrayType(StringType())`` return type
    declared for the UDF below.
    """
    global nlp
    # The model is loaded on every call on purpose: it works around the
    # "Can't find model" error described in the question.
    nlp = spacy.load("en_core_web_lg")
    doc = nlp(x)
    ents = []
    for ent in doc.ents:
        if ent.label_ == 'PERSON' or ent.label_ == 'ORG':
            ents.append(ent.label_)
    return ents


get_entities_udf = F.udf(__get_entities, T.ArrayType(T.StringType()))
Pandas UDF with error:
def __get_entities(x):
global nlp
nlp = spacy.load("en_core_web_lg")
doc = nlp(x)
for ent in doc.ents:
if ent.label_ == 'PERSON' OR ent.label_ == 'ORG':
return pd.Series(ents)
get_entities_udf = F.pandas_udf(lambda x: __get_entities(x), "array<string>", F.PandasUDFType.SCALAR)
Error message:
TypeError: Argument 'string' has incorrect type (expected str, got series)
Sample Spark DataFrame:
# Four single-column rows; the column is called 'name' because that is the
# column the UDF is applied to below.
df = spark.createDataFrame([
    ['John Doe'],
    ['Jane Doe'],
    ['Microsoft Corporation'],
    ['Apple Inc.'],
], ['name'])
New column:
df_new = df.withColumn('entity',get_entities_udf('name'))
You need to treat the input as a pd.Series instead of a single value.
I was able to get it working by refactoring the code a bit. Notice the x.apply call, which is pandas-specific and applies a function to each element of a pd.Series.
def entities(x):
    """Return the labels of the PERSON and ORG entities found in the string ``x``."""
    global nlp
    # Imported here so the name is available wherever the function runs.
    import spacy
    nlp = spacy.load("en_core_web_lg")
    doc = nlp(x)
    ents = []
    for ent in doc.ents:
        if ent.label_ == 'PERSON' or ent.label_ == 'ORG':
            ents.append(ent.label_)
    return ents
def __get_entities(x):
    """Apply ``entities`` to every element of the pandas Series ``x``."""
    return x.apply(entities)
get_entities_udf = pandas_udf(lambda x: __get_entities(x), "array<string>", PandasUDFType.SCALAR)
df_new = df.withColumn('entity',get_entities_udf('name'))
| name| entity|
| John Doe|[PERSON]|
| Jane Doe|[PERSON]|
|Microsoft Corpora...| [ORG]|
| Apple Inc.| [ORG]|
I'm using: pyspark 3.1.1 and python 3.7
The answer above didn't work for me, and I spent quite some time making things work, so I thought I'd share the solution I came up with.
Setting things up
creating a sample of 16 random person and company names
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, ArrayType
from pyspark.sql.functions import pandas_udf, PandasUDFType
from faker import Faker
import spacy
spark = SparkSession.builder.appName("pyspark_sandbox").getOrCreate()
names = []
fake = Faker()
for _ in range(8):
    # One person name and one company name per iteration: 16 names in total.
    names.append(fake.name())
    names.append(f"{fake.company()} {fake.company_suffix()}")
df = spark.createDataFrame(names, StringType())
As it is
First, let's check the currently proposed solution. I'm just adding a print statement when the spaCy model is loaded, to see how many times the model gets loaded.
# printing a msg each time we load the model
def load_spacy_model():
    """Load and return the small English spaCy model, announcing each load on stdout."""
    print("Loading spacy model...")
    return spacy.load("en_core_web_sm")
def entities(x):
    """Return the labels of the PERSON and ORG entities found in the string ``x``.

    The model is reloaded through ``load_spacy_model`` on every call, which
    is what makes the number of loads visible in the printed output.
    """
    global nlp
    import spacy
    nlp = load_spacy_model()
    doc = nlp(x)
    ents = []
    for ent in doc.ents:
        if ent.label_ == 'PERSON' or ent.label_ == 'ORG':
            ents.append(ent.label_)
    return ents
def __get_entities(x):
    """Apply ``entities`` to every element of the pandas Series ``x``."""
    return x.apply(entities)
get_entities_udf = pandas_udf(lambda x: __get_entities(x), "array<string>", PandasUDFType.SCALAR)
df_new = df.withColumn('entity',get_entities_udf('value'))
We can then see that the model is loaded 16 times, so one for every single entry we process. Not what I want.
Batch processing
Let's rewrite it using the decorator introduced in Spark 3.0+, which relies on type hints (Python 3.6+). Our UDF then uses nlp.pipe() to batch-process the entire pd.Series.
# printing a msg each time we load the model
def load_spacy_model():
    """Load and return the small English spaCy model, announcing each load on stdout."""
    print("Loading spacy model...")
    return spacy.load("en_core_web_sm")
# decorator indicating that this function is pandas_udf
# and that it's gonna process list of string
@pandas_udf(ArrayType(StringType()))
# function receiving a pd.Series and returning a pd.Series
def entities(list_of_text: pd.Series) -> pd.Series:
    """Return, for each text in the batch, the list of its entity labels."""
    global nlp
    # The model is loaded once per invocation, i.e. once per batch.
    nlp = load_spacy_model()
    docs = nlp.pipe(list_of_text)
    # retrieving the str representation of entity label
    # as we are limited in the types of obj
    # we can return from a panda_udf
    # we couldn't return a Span obj for example
    ents = [
        [ent.label_ for ent in doc.ents]
        for doc in docs
    ]
    return pd.Series(ents)
df_new = df.withColumn('entity',entities('value'))
In my case the model was loaded 4 times, which is better. The model is loaded each time a Python worker is created to process a batch, so the number depends on how many cores Spark is using and, more critically in my case, on how the data is partitioned. So it's not yet optimal.
broadcasting the nlp object
# printing a msg each time we load the model
def load_spacy_model():
    """Load and return the small English spaCy model, announcing each load on stdout."""
    print("Loading spacy model...")
    return spacy.load("en_core_web_sm")
# Registered as a pandas UDF so that it can be applied to a column by name.
@pandas_udf(ArrayType(StringType()))
def entities(list_of_text: pd.Series) -> pd.Series:
    """Return, for each text in the batch, the list of its entity labels."""
    # Use the broadcast model instead of loading it inside the UDF.
    nlp = boardcasted_nlp.value
    docs = nlp.pipe(list_of_text)
    # retrieving the str representation of entity label
    # as we are limited in the types of obj
    # we can return from a panda_udf
    # we couldn't return a Span obj for example
    ents = [
        [ent.label_ for ent in doc.ents]
        for doc in docs
    ]
    return pd.Series(ents)
boardcasted_nlp = spark.sparkContext.broadcast(load_spacy_model())
df_new = df.withColumn('entity',entities('value'))
Now the model is loaded only once then broadcasted to every python worker that is getting spawned.
The complete Code
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, ArrayType
from pyspark.sql.functions import pandas_udf, PandasUDFType
from faker import Faker
import spacy
spark = SparkSession.builder.appName("pyspark_sandbox").getOrCreate()
# creating our set of fake person and company names
names = []
fake = Faker()
for _ in range(8):
    # One person name and one company name per iteration: 16 names in total.
    names.append(fake.name())
    names.append(f"{fake.company()} {fake.company_suffix()}")
df = spark.createDataFrame(names, StringType())
# printing a msg each time we load the model
def load_spacy_model():
    """Load and return the small English spaCy model, announcing each load on stdout."""
    print("Loading spacy model...")
    return spacy.load("en_core_web_sm")
# decorator indicating that this function is pandas_udf
# and that it's gonna process list of string
@pandas_udf(ArrayType(StringType()))
# function receiving a pd.Series and returning a pd.Series
def entities(list_of_text: pd.Series) -> pd.Series:
    """Return, for each text in the batch, the list of its entity labels."""
    # retrieving the shared nlp object
    nlp = boardcasted_nlp.value
    # batch processing our list of text
    docs = nlp.pipe(list_of_text)
    # retrieving the str representation of entity label
    # as we are limited in the types of obj
    # we can return from a panda_udf
    # we couldn't return a Span obj for example
    ents = [
        [ent.label_ for ent in doc.ents]
        for doc in docs
    ]
    return pd.Series(ents)
# we load the spacy model and broadcast it
boardcasted_nlp = spark.sparkContext.broadcast(load_spacy_model())
df_new = df.withColumn('entity',entities('value'))
|value |entity |
|Ferguson, Price and Green Ltd |[ORG, ORG, ORG] |
|Cassandra Goodman MD |[PERSON] |
|Solis Ltd LLC |[ORG] |
|Laurie Foster |[PERSON] |
|Lane-Vasquez Group |[ORG] |
|Matthew Wright |[PERSON] |
|Scott, Pugh and Rodriguez and Sons|[PERSON, PERSON, PERSON, PERSON]|
|Tina Cooke |[PERSON] |
|Watkins, Blake and Foster Ltd |[ORG] |
|Charles Reyes |[PERSON] |
|Cooper, Norris and Roberts PLC |[ORG] |
|Michael Tate |[PERSON] |
|Powell, Lawson and Perez and Sons |[PERSON, PERSON, PERSON, PERSON]|
|James Wolf PhD |[PERSON] |
|Greer-Swanson PLC |[ORG] |
|Nicholas Hale |[PERSON] |
I'm using beautiful soup and I'm getting the error, "AttributeError: 'NoneType' object has no attribute 'get_text'" and also "TypeError: 'NoneType' object is not subscriptable".
I know my code works when I use it to search for a single restaurant. However when I try to make a loop for all restaurants, then I get an error.
Here is my screen recording showing the problem.
The rest of the code can be found here:
# AttributeError: 'NoneType' object has no attribute 'get_text'
restaurant_address = yelp_containers[yelp_container].find("address", {
"class": 'lemon--address__373c0__2sPac'
print("restaurant_address: ", restaurant_address)
# TypeError: 'NoneType' object is not subscriptable
restaurant_starCount = yelp_containers[yelp_container].find("div", {
"class": "lemon--div__373c0__1mboc i-stars__373c0__30xVZ i-stars--regular-4__373c0__2R5IO border-color--default__373c0__2oFDT overflow--hidden__373c0__8Jq2I"
print("restaurant_starCount: ", restaurant_starCount)
# AttributeError: 'NoneType' object has no attribute 'text'
restaurant_district = yelp_containers[yelp_container].find("div", {
"class": "lemon--div__373c0__1mboc display--inline-block__373c0__25zhW border-color--default__373c0__2xHhl"
print("restaurant_district: ", restaurant_district)
You are getting the errors because your selectors are too specific and you don't check whether the tag was found. One solution is to loosen the selectors (the lemon--div-XXX... class names will probably change in the near future anyway):
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
import csv
import re
# NOTE(review): the search URL is empty in the post; set it before running.
my_url = ''
# Read the page inside a context manager so the connection is always closed.
with uReq(my_url) as uClient:
    page_html = uClient.read()
bs = soup(page_html, "html.parser")
# <li> elements containing "read more" that follow the <li> containing
# "All Results".
yelp_containers = bs.select('li:contains("All Results") ~ li:contains("read more")')
for idx, item in enumerate(yelp_containers, 1):
    print("--- Restaurant number #", idx)
    restaurant_title = item.h3.get_text(strip=True)
    # Drop the leading digits, dots and whitespace in front of the title.
    restaurant_title = re.sub(r'^[\d.\s]+', '', restaurant_title)
    restaurant_address = item.select_one('[class*="secondaryAttributes"]').get_text(separator='|', strip=True).split('|')[1]
    restaurant_numReview = item.select_one('[class*="reviewCount"]').get_text(strip=True)
    restaurant_numReview = re.sub(r'[^\d.]', '', restaurant_numReview)
    restaurant_starCount = item.select_one('[class*="stars"][aria-label]')['aria-label']
    restaurant_starCount = re.sub(r'[^\d.]', '', restaurant_starCount)
    # The price range element may be absent, so check before reading it.
    pr = item.select_one('[class*="priceRange"]')
    restaurant_price = pr.get_text(strip=True) if pr else '-'
    restaurant_category = [a.get_text(strip=True) for a in item.select('[class*="priceRange"] ~ span a')]
    restaurant_district = item.select_one('[class*="secondaryAttributes"]').get_text(separator='|', strip=True).split('|')[-1]
    # NOTE(review): the print statements were lost from the post; this prints
    # every extracted field - confirm against the intended output.
    print(restaurant_title)
    print(restaurant_address)
    print(restaurant_numReview)
    print(restaurant_starCount)
    print(restaurant_price)
    print(restaurant_category)
    print(restaurant_district)
    print('-' * 80)
--- Restaurant number # 1
Fog Harbor Fish House
Pier 39
['Seafood', 'Bars']
Fisherman's Wharf
--- Restaurant number # 2
The House
1230 Grant Ave
['Asian Fusion']
North Beach/Telegraph Hill
...and so on.
Can't find module 'textacy' has no attribute 'Doc'
I am trying to extract verb phrases using spaCy, but there is no such library. Please help me: how can I extract verb phrases or adjective phrases using spaCy? I want to do full shallow parsing.
def extract_named_nouns(row_series):
    """Combine nouns and non-numerical entities.

    Keyword arguments:
    row_series -- a Pandas Series object

    Both ``row_series['nouns']`` and ``row_series['named_ents']`` are read as
    iterables of tuples whose element 1 is used as the matching and sorting
    key.  Returns a list of tuples sorted by that key.
    """
    ents = set()
    idxs = set()
    # remove duplicates and merge two lists together
    # NOTE(review): the bodies of both ``if`` statements were missing from the
    # mangled source; the ``add`` calls are reconstructed from the comment
    # above - confirm against the original.
    for noun_tuple in row_series['nouns']:
        for named_ents_tuple in row_series['named_ents']:
            if noun_tuple[1] == named_ents_tuple[1]:
                # Same key: keep the named entity and remember the key.
                idxs.add(noun_tuple[1])
                ents.add(named_ents_tuple)
        if noun_tuple[1] not in idxs:
            # No named entity shares this key, so keep the plain noun.
            ents.add(noun_tuple)
    return sorted(list(ents), key=lambda x: x[1])
def add_named_nouns(df):
    """Create new column in data frame with nouns and named ents.

    Keyword arguments:
    df -- a dataframe object

    The column ``named_nouns`` is added to ``df`` in place by applying
    ``extract_named_nouns`` to every row; nothing is returned.
    """
    df['named_nouns'] = df.apply(extract_named_nouns, axis=1)
from __future__ import unicode_literals
import spacy,en_core_web_sm
import textacy
from textacy import io
#using spacy for nlp
nlp = en_core_web_sm.load()
sentence = 'The author is writing a new book.'
pattern = r'<VERB>?<ADV>*<VERB>+'
doc = textacy.Doc.load(sentence, metadata=metadata, lang='en_core_web_sm')
# doc = textacy.corpus.Corpus(sentence, lang='en_core_web_sm')
lists = textacy.extract.pos_regex_matches(doc, pattern)
for list in lists:
module 'textacy' has no attribute 'Doc'
Try following the examples here:
It should be as simple as:
doc = textacy.make_spacy_doc("The author is writing a new book.", lang='en_core_web_sm')
You might also look into using spaCy alone (without textacy) with its built-in Matcher instead.
# "en_core_web_sm" is the model used by the rest of this thread;
# "en_core_web_en" is not a spaCy model name.
spacy_lang = textacy.load_spacy_lang("en_core_web_sm")
docx_textacy = spacy_lang(sentence)