Apache Beam job (Python) using Tensorflow Transform is killed by Cloud Dataflow - tensorflow

I'm trying to run an Apache Beam job based on Tensorflow Transform on Dataflow, but it gets killed. Has anyone experienced this behaviour? This is a simple example with DirectRunner that runs fine locally but fails on Dataflow (after I change the runner accordingly):
import os
import csv
import datetime
import numpy as np
import tensorflow as tf
import tensorflow_transform as tft
from apache_beam.io import textio
from apache_beam.io import tfrecordio
from tensorflow_transform.beam import impl as beam_impl
from tensorflow_transform.beam import tft_beam_io
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema
import apache_beam as beam
NUMERIC_FEATURE_KEYS = ['feature_'+str(i) for i in range(2000)]
def _create_raw_metadata():
    """Build the raw-data metadata describing every numeric input feature.

    Each key in ``NUMERIC_FEATURE_KEYS`` is declared as a ``tf.float32``
    column with an empty shape (``[]``) and a fixed column representation.

    Returns:
        A ``dataset_metadata.DatasetMetadata`` wrapping the resulting schema.
    """
    feature_columns = {
        name: dataset_schema.ColumnSchema(
            tf.float32, [], dataset_schema.FixedColumnRepresentation())
        for name in NUMERIC_FEATURE_KEYS
    }
    return dataset_metadata.DatasetMetadata(dataset_schema.Schema(feature_columns))
def preprocessing_fn(inputs):
    """Scale every numeric feature with ``tft.scale_to_0_1``.

    Args:
        inputs: mapping from feature key to the corresponding input tensor;
            it must contain every key listed in ``NUMERIC_FEATURE_KEYS``.

    Returns:
        A dict with the same keys whose values are the scaled tensors.
    """
    # One independent tft.scale_to_0_1 call is issued per feature key.
    return {key: tft.scale_to_0_1(inputs[key]) for key in NUMERIC_FEATURE_KEYS}
def main():
    """Analyze, transform and write a random dataset with tf.Transform.

    The pipeline writes the raw metadata, the transform function and the
    transformed training data (gzipped TFRecords) under a timestamped
    directory in ``/tmp``.
    """
    timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    output_dir = '/tmp/tmp-folder-{}'.format(timestamp)
    RUNNER = 'DirectRunner'
    with beam.Pipeline(RUNNER) as p:
        with beam_impl.Context(temp_dir=output_dir):
            raw_data_metadata = _create_raw_metadata()
            metadata_path = os.path.join(output_dir, 'rawdata_metadata')
            _ = (raw_data_metadata
                 | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(
                     metadata_path, pipeline=p))

            # 100 random rows with one value per numeric feature, in [0, 100).
            m = np.random.rand(100, 2000) * 100
            rows = [dict(zip(NUMERIC_FEATURE_KEYS, row)) for row in m]
            raw_data = p | 'CreateTestDataset' >> beam.Create(rows)
            raw_dataset = (raw_data, raw_data_metadata)

            transform_fn = (
                raw_dataset
                | 'Analyze' >> beam_impl.AnalyzeDataset(preprocessing_fn))
            _ = (transform_fn
                 | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(output_dir))

            transformed_data, transformed_metadata = (
                (raw_dataset, transform_fn)
                | 'Transform' >> beam_impl.TransformDataset())
            transformed_data_coder = tft.coders.ExampleProtoCoder(
                transformed_metadata.schema)
            _ = (transformed_data
                 | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                     os.path.join(output_dir, 'train'),
                     file_name_suffix='.gz',
                     coder=transformed_data_coder))
if __name__ == '__main__':
main()
Also, my production code (not shown) fails with the message: The job graph is too large. Please try again with a smaller job graph, or split your job into two or more smaller jobs.
Any hint?

The restriction on the pipeline description size is documented here:
https://cloud.google.com/dataflow/quotas#limits
There is a way around that, instead of creating stages for each tensor that goes into tft.scale_to_0_1 we could fuse them by first stacking them together, and then passing them into tft.scale_to_0_1 with 'elementwise=True'.
The result will be the same, because the min and max are computed per 'column' instead of across the whole tensor.
This would look something like this:
stacked = tf.stack([inputs[key] for key in NUMERIC_FEATURE_KEYS], axis=1)
scaled_stacked = tft.scale_to_0_1(stacked, elementwise=True)
for key, tensor in zip(NUMERIC_FEATURE_KEYS, tf.unstack(scaled_stacked, axis=1)):
outputs[key] = tensor

Related

Pandas UDF (PySpark) - Incorrect type Error

I'm trying entity extraction with spaCy and Pandas UDF (PySpark) but I get an error.
Using a UDF works without errors but is slow. What am I doing wrong?
Loading the model every time is to avoid load error - Can't find model 'en_core_web_lg'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.
Working UDF:
def __get_entities(x):
    """Return the labels of the PERSON/ORG entities spaCy finds in one string.

    Args:
        x: a single text value (one row of the column the UDF is applied to).

    Returns:
        A list of entity label strings, each either 'PERSON' or 'ORG'.
    """
    global nlp
    # The model is loaded on every call on purpose: loading it once up front
    # produced "Can't find model 'en_core_web_lg'" errors.
    nlp = spacy.load("en_core_web_lg")
    ents = []
    doc = nlp(x)
    for ent in doc.ents:
        if ent.label_ == 'PERSON' or ent.label_ == 'ORG':
            ents.append(ent.label_)
    return ents


get_entities_udf = F.udf(__get_entities, T.ArrayType(T.StringType()))
Pandas UDF with error:
def __get_entities(x):
    """Entity extraction as first attempted with a pandas UDF.

    NOTE: this version reproduces the reported TypeError. A SCALAR pandas UDF
    receives a pd.Series, and the whole series is passed to ``nlp`` below,
    which expects a single str.
    """
    global nlp
    nlp = spacy.load("en_core_web_lg")
    ents = []
    doc = nlp(x)
    for ent in doc.ents:
        if ent.label_ == 'PERSON' or ent.label_ == 'ORG':
            ents.append(ent.label_)
    return pd.Series(ents)


get_entities_udf = F.pandas_udf(lambda x: __get_entities(x), "array<string>", F.PandasUDFType.SCALAR)
Error message:
TypeError: Argument 'string'has incorrect type (expected str, got series)
Sample Spark DataFrame:
df = spark.createDataFrame([
['John Doe'],
['Jane Doe'],
['Microsoft Corporation'],
['Apple Inc.'],
]).toDF("name",)
New column:
df_new = df.withColumn('entity',get_entities_udf('name'))
You need to treat the input as a pd.Series instead of a single value.
I was able to get it working by refactoring the code a bit. Notice the x.apply call, which is pandas-specific and applies a function to each element of a pd.Series.
def entities(x):
    """Return the PERSON/ORG entity labels spaCy detects in a single string.

    Args:
        x: one text value.

    Returns:
        A list of label strings, each either 'PERSON' or 'ORG'.
    """
    global nlp
    import spacy
    # The model is loaded on every call, i.e. once per processed value.
    nlp = spacy.load("en_core_web_lg")
    wanted_labels = ('PERSON', 'ORG')
    return [ent.label_ for ent in nlp(x).ents if ent.label_ in wanted_labels]
def __get_entities(x):
return x.apply(entities)
get_entities_udf = pandas_udf(lambda x: __get_entities(x), "array<string>", PandasUDFType.SCALAR)
df_new = df.withColumn('entity',get_entities_udf('name'))
df_new.show()
+--------------------+--------+
| name| entity|
+--------------------+--------+
| John Doe|[PERSON]|
| Jane Doe|[PERSON]|
|Microsoft Corpora...| [ORG]|
| Apple Inc.| [ORG]|
+--------------------+--------+
I'm using: pyspark 3.1.1 and python 3.7
The answer above didn't work for me, and I spent quite some time making things work, so I thought I'd share the solution I came up with.
Setting things up
creating a sample of 16 random person and company names
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, ArrayType
from pyspark.sql.functions import pandas_udf, PandasUDFType
from faker import Faker
import spacy
spark = SparkSession.builder.appName("pyspark_sandbox").getOrCreate()
names = []
fake = Faker()
for _ in range(8):
names.append(f"{fake.company()} {fake.company_suffix()}")
names.append(fake.name())
df = spark.createDataFrame(names, StringType())
As it is
First, checking the currently proposed solution. I'm just adding a print statement when loading the spaCy model to see how many times the model is loaded.
# printing a msg each time we load the model
def load_spacy_model():
print("Loading spacy model...")
return spacy.load("en_core_web_sm")
def entities(x):
global nlp
import spacy
nlp = load_spacy_model()
ents=[]
doc = nlp(x)
for ent in doc.ents:
if ent.label_ == 'PERSON' or ent.label_ == 'ORG':
ents.append(ent.label_)
return ents
def __get_entities(x):
return x.apply(entities)
get_entities_udf = pandas_udf(lambda x: __get_entities(x), "array<string>", PandasUDFType.SCALAR)
df_new = df.withColumn('entity',get_entities_udf('value'))
df_new.show()
We can then see that the model is loaded 16 times, so one for every single entry we process. Not what I want.
Batch processing
Rewriting using the decorator introduced in Spark 3.0+ that relies on type hints (Python 3.6+). Our UDF then uses nlp.pipe() to batch-process the entire pd.Series.
# printing a msg each time we load the model
def load_spacy_model():
print("Loading spacy model...")
return spacy.load("en_core_web_sm")
# decorator indicating that this function is pandas_udf
# and that it's gonna process list of string
# function receiving a pd.Series and returning a pd.Series
@pandas_udf(ArrayType(StringType()))
def entities(list_of_text: pd.Series) -> pd.Series:
    """Return, for each text in the batch, the list of its entity labels.

    Args:
        list_of_text: the batch of strings handed over by Spark.

    Returns:
        A pd.Series with one list of label strings per input text.
    """
    global nlp
    # Loaded once per invocation, i.e. once per batch rather than per row.
    nlp = load_spacy_model()
    docs = nlp.pipe(list_of_text)
    # retrieving the str representation of entity label
    # as we are limited in the types of obj
    # we can return from a panda_udf
    # we couldn't return a Span obj for example
    ents = [
        [ent.label_ for ent in doc.ents]
        for doc in docs
    ]
    return pd.Series(ents)
df_new = df.withColumn('entity',entities('value'))
df_new.show()
In my case the model was loaded 4 times, which is better. It is loaded each time a Python worker is created to process a batch, so the number depends on how many cores Spark is using and, more critically in my case, on how the data is partitioned. So it is still not optimal.
broadcasting the nlp object
# printing a msg each time we load the model
def load_spacy_model():
print("Loading spacy model...")
return spacy.load("en_core_web_sm")
@pandas_udf(ArrayType(StringType()))
def entities(list_of_text: pd.Series) -> pd.Series:
    """Return, for each text in the batch, the list of its entity labels.

    The spaCy pipeline is read from the ``boardcasted_nlp`` broadcast
    variable instead of being loaded inside the UDF.

    Args:
        list_of_text: the batch of strings handed over by Spark.

    Returns:
        A pd.Series with one list of label strings per input text.
    """
    nlp = boardcasted_nlp.value
    docs = nlp.pipe(list_of_text)
    # retrieving the str representation of entity label
    # as we are limited in the types of obj
    # we can return from a panda_udf
    # we couldn't return a Span obj for example
    ents = [
        [ent.label_ for ent in doc.ents]
        for doc in docs
    ]
    return pd.Series(ents)
boardcasted_nlp = spark.sparkContext.broadcast(load_spacy_model())
df_new = df.withColumn('entity',entities('value'))
df_new.show()
Now the model is loaded only once then broadcasted to every python worker that is getting spawned.
The complete Code
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, ArrayType
from pyspark.sql.functions import pandas_udf, PandasUDFType
from faker import Faker
import spacy
spark = SparkSession.builder.appName("pyspark_sandbox").getOrCreate()
# creating our set of fake person and company names
names = []
fake = Faker()
for _ in range(8):
names.append(f"{fake.company()} {fake.company_suffix()}")
names.append(fake.name())
df = spark.createDataFrame(names, StringType())
# printing a msg each time we load the model
def load_spacy_model():
print("Loading spacy model...")
return spacy.load("en_core_web_sm")
# decorator indicating that this function is pandas_udf
# and that it's gonna process list of string
# function receiving a pd.Series and returning a pd.Series
@pandas_udf(ArrayType(StringType()))
def entities(list_of_text: pd.Series) -> pd.Series:
    """Return, for each text in the batch, the list of its entity labels.

    Args:
        list_of_text: the batch of strings handed over by Spark.

    Returns:
        A pd.Series with one list of label strings per input text.
    """
    # retrieving the shared nlp object
    nlp = boardcasted_nlp.value
    # batch processing our list of text
    docs = nlp.pipe(list_of_text)
    # retrieving the str representation of entity label
    # as we are limited in the types of obj
    # we can return from a panda_udf
    # we couldn't return a Span obj for example
    ents = [
        [ent.label_ for ent in doc.ents]
        for doc in docs
    ]
    return pd.Series(ents)
# we load the spacy model and broadcast it
boardcasted_nlp = spark.sparkContext.broadcast(load_spacy_model())
df_new = df.withColumn('entity',entities('value'))
df_new.show(truncate=False)
Result
+----------------------------------+--------------------------------+
|value |entity |
+----------------------------------+--------------------------------+
|Ferguson, Price and Green Ltd |[ORG, ORG, ORG] |
|Cassandra Goodman MD |[PERSON] |
|Solis Ltd LLC |[ORG] |
|Laurie Foster |[PERSON] |
|Lane-Vasquez Group |[ORG] |
|Matthew Wright |[PERSON] |
|Scott, Pugh and Rodriguez and Sons|[PERSON, PERSON, PERSON, PERSON]|
|Tina Cooke |[PERSON] |
|Watkins, Blake and Foster Ltd |[ORG] |
|Charles Reyes |[PERSON] |
|Cooper, Norris and Roberts PLC |[ORG] |
|Michael Tate |[PERSON] |
|Powell, Lawson and Perez and Sons |[PERSON, PERSON, PERSON, PERSON]|
|James Wolf PhD |[PERSON] |
|Greer-Swanson PLC |[ORG] |
|Nicholas Hale |[PERSON] |
+----------------------------------+--------------------------------+

could not convert string to float in python

I am trying to run a principal component analysis on a CSV file, but when I run the code I get this error:
C:\Users\Lenovo\Desktop>python pca.py
ValueError: could not convert string to float: Annee;NET;INT;SUB;LMT;DCT;IMM;EXP;VRD
This is my CSV file.
I tried removing any spaces and anything else unusual.
This is my Python script; I don't know what I am missing.
Note: I run this code under Python 2.7.
from sklearn.externals import joblib
import numpy as np
import glob
import os
import time
import numpy
my_matrix = numpy.loadtxt(open("pca.csv","rb"),delimiter= ",",skiprows=0)
def pca(dataMat, r, autoset_r=False, autoset_rate=0.9):
    """
    purpose: principal components analysis

    Args:
        dataMat: data with one sample per row and one feature per column
            (the mean and covariance are taken over axis 0 / columns).
        r: number of leading eigenvectors to keep.
        autoset_r: when true, ``r`` is replaced by the value returned by
            ``autoset_eigNum(eigVals, autoset_rate)``.
        autoset_rate: rate forwarded to ``autoset_eigNum``.

    Returns:
        ``(lowDDataMat, reconMat)``: the data projected on the kept
        eigenvectors, and its reconstruction in the original space.

    Side effects:
        Dumps the kept eigenvectors and the mean under ``./pca_args_save/``
        with joblib and prints progress/timing messages.
    """
    print("Start to do PCA...")
    started_at = time.time()

    meanVal = np.mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVal
    # Data is only centred here; no division by the standard deviation.
    covMat = np.cov(meanRemoved, rowvar=0)
    eigVals, eigVects = np.linalg.eig(np.mat(covMat))
    # Negating the eigenvalues makes argsort return indices in descending order.
    eigValIndex = np.argsort(-eigVals)

    if autoset_r:
        r = autoset_eigNum(eigVals, autoset_rate)
        print("autoset: take top {} of {} features".format(r, meanRemoved.shape[1]))

    r_eigVect = eigVects[:, eigValIndex[:r]]
    # NOTE(review): `*` is presumably a matrix product here because the
    # eigenvectors come from an np.mat input -- confirm before porting to ndarrays.
    lowDDataMat = meanRemoved * r_eigVect
    reconMat = (lowDDataMat * r_eigVect.T) + meanVal

    finished_at = time.time()
    print("PCA takes %f seconds" % (finished_at - started_at))

    joblib.dump(r_eigVect, './pca_args_save/r_eigVect.eig')
    joblib.dump(meanVal, './pca_args_save/meanVal.mean')
    return lowDDataMat, reconMat
def autoset_eigNum(eigValues, rate=0.99):
    """Return how many of the largest eigenvalues are needed to reach ``rate``.

    The eigenvalues are sorted in descending order and accumulated until the
    running sum divided by the total sum is at least ``rate``.

    Args:
        eigValues: array of eigenvalues (must provide ``.sum()``).
        rate: fraction of the total eigenvalue sum to reach.

    Returns:
        The smallest count whose cumulative share is >= ``rate``; the total
        number of eigenvalues if the rate is never reached; 0 for empty input.
    """
    # Without this guard the loop never runs and there is nothing to return.
    if len(eigValues) == 0:
        return 0

    eigVals_total = eigValues.sum()
    running_sum = 0
    count = 0
    # Accumulate incrementally instead of re-summing the prefix every step.
    for count, value in enumerate(sorted(eigValues, reverse=True), start=1):
        running_sum += value
        if running_sum / eigVals_total >= rate:
            break
    return count
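It seems that NumPy fails to parse your header row as floats.
Try setting skiprows=1 in your np.loadtxt call in order to skip the table header. The header shown in the error message is also separated by semicolons, so if the data rows use the same separator you will need delimiter=";" as well.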

Numpy slogdet computation error

There appears to be a major difference between numpy's slogdet and the exact result when computing the log determinant of a Vandermonde matrix.
I compare against the exact log determinant, see eg here for proof.
The minimal code to see this is:
import numpy as np

# Vandermonde matrix on 50 equally spaced nodes in [0, 1]: A[i, j] = x_i ** j.
nodes = np.linspace(0, 1, 50)
A = np.power.outer(nodes, range(50))
print(np.linalg.slogdet(A)[1])

# Reference value: sum of log(x_i - x_j) over all pairs with x_i > x_j.
# The nodes are strictly increasing, so those pairs are exactly j < i.
s = 0
for i, v1 in enumerate(nodes):
    for v2 in nodes[:i]:
        s += np.log(v1 - v2)
print(s)
Which yields:
-1191.88408998
-1706.99560647
I was wondering if there was a more accurate log determinant implementation which I could use in this situation but also in non-Vandermonde matrix situation.
You can use sympy and mpmath like this:
import numpy as np
import sympy as smp
import mpmath as mp
# Work with 50 significant decimal digits in mpmath.
mp.mp.dps = 50

# The same 50 nodes as arbitrary-precision numbers; the standalone mpmath
# package is used directly because the bundled sympy.mpmath alias is gone
# from current SymPy releases.
nodes = [mp.mpf(x) for x in np.linspace(0, 1, 50)]

# slogdet itself still runs in double precision on the float matrix.
A = np.power.outer([float(x) for x in nodes], range(50))
first_print = mp.mpf(np.linalg.slogdet(A)[1])
print(first_print)

# Exact log-determinant: sum of log(x_i - x_j) over all pairs with x_i > x_j.
s = 0
for v1 in nodes:
    for v2 in nodes:
        if v1 > v2:
            s += mp.log(v1 - v2)
print(s)
RESULTS
first_print = -1178.272517342130186079884879291057586669921875
s = -1706.9956064674289001970168329846189154212781094939

MemoryError when querying database from Process

I am trying to create a program with 3 processes that read from the same database. The code was working before I started introducing processes.
I am getting MemoryError when performing a select() from PeeWee, I suspect there is something wrong with sharing of resources. Minimal example:
models.py
from playhouse.pool import PooledSqliteExtDatabase
file_scanner_database = PooledSqliteExtDatabase(
None,
max_connections=32,
)
class FileModel(Model):
class Meta:
database = file_scanner_database
main.py
from file_scanner import FileScanner
from models import file_scanner_database
from models import FileModel
from multiprocessing import Process
def create_scanner_agent(data):
    """Process entry point: build a FileScanner from ``data`` and start it.

    Args:
        data: mapping forwarded unchanged to ``FileScanner``.
    """
    FileScanner(data).start_scanner()
shared_info = {'db_location': '/absolute/path/to/database'}
file_scanner_database.init(shared_info['db_location'])
file_scanner_database.connect()
file_scanner_database.create_tables([FileModel], safe=True)
new_process = Process(
target=create_scanner_agent,
args=(shared_info,)
)
new_process.daemon = True
new_process.start()
try:
new_process.join()
except KeyboardInterrupt:
pass
new_process.terminate()
file_scanner.py
from models import file_scanner_database
from models import FileModel
class FileScanner:
    """Scanner that repeatedly iterates over every ``FileModel`` row."""

    def __init__(self, data):
        """Copy ``data`` onto the instance and open the database connection.

        Args:
            data: mapping whose items become instance attributes; it must
                provide ``db_location``, which is used to initialise the
                database.
        """
        for attr_name, attr_value in data.items():
            setattr(self, attr_name, attr_value)
        file_scanner_database.init(self.db_location)
        file_scanner_database.connect()

    def start_scanner(self):
        """Loop forever, selecting all ``FileModel`` rows on each pass."""
        while True:
            # THIS IS WHERE THE PROGRAM CRASHES
            for row in FileModel.select():
                ...
It looks like you're trying to access memory across a fork? Or some such craziness? I think the answer is that you're doing it wrong homie. Try opening your DB connection after the fork.

Is there a way to get tensorflow tf.Print output to appear in Jupyter Notebook output

I'm using the tf.Print op in a Jupyter notebook. It works as required, but will only print the output to the console, without printing in the notebook. Is there any way to get around this?
An example would be the following (in a notebook):
import tensorflow as tf
a = tf.constant(1.0)
a = tf.Print(a, [a], 'hi')
sess = tf.Session()
a.eval(session=sess)
That code will print 'hi[1]' in the console, but nothing in the notebook.
Update Feb 3, 2017
I've wrapped this into memory_util package. Example usage
# install memory util
import urllib.request
response = urllib.request.urlopen("https://raw.githubusercontent.com/yaroslavvb/memory_util/master/memory_util.py")
open("memory_util.py", "wb").write(response.read())
import memory_util
sess = tf.Session()
a = tf.random_uniform((1000,))
b = tf.random_uniform((1000,))
c = a + b
with memory_util.capture_stderr() as stderr:
sess.run(c.op)
print(stderr.getvalue())
**Old stuff**
You could reuse FD redirector from IPython core. (idea from Mark Sandler)
import os
import sys
STDOUT = 1
STDERR = 2


class FDRedirector(object):
    """Redirect output (stdout or stderr) at the OS level using file descriptors.

    While started, everything written to the chosen descriptor goes into a
    pipe; ``stop()`` restores the descriptor and returns what was written.

    NOTE(review): the pipe is only read in ``stop()``, so a writer may block
    once the OS pipe buffer is full -- keep the captured output small.
    """

    def __init__(self, fd=STDOUT):
        """Prepare a redirector for ``fd``.

        Args:
            fd: the file descriptor of the output you want to capture.
                It can be STDOUT or STDERR.
        """
        self.fd = fd
        self.started = False
        self.piper = None
        self.pipew = None
        self.oldhandle = None

    def start(self):
        """Set up the redirection (no-op when already started)."""
        if not self.started:
            # Push out anything buffered at the Python level first so that
            # earlier output does not end up in the capture.
            self.flush()
            self.oldhandle = os.dup(self.fd)
            self.piper, self.pipew = os.pipe()
            os.dup2(self.pipew, self.fd)
            # self.fd now refers to the write end; the extra handle is not needed.
            os.close(self.pipew)
            self.pipew = None
            self.started = True

    def flush(self):
        """Flush the Python-level stream that matches the captured descriptor."""
        if self.fd == STDOUT:
            sys.stdout.flush()
        elif self.fd == STDERR:
            sys.stderr.flush()

    def stop(self):
        """Unset the redirection and return the captured output.

        Returns:
            The captured text, or '' when the redirection was not started.
        """
        if not self.started:
            return ''
        self.flush()
        # Restoring the descriptor closes the last write end of the pipe,
        # so the read below terminates at EOF.
        os.dup2(self.oldhandle, self.fd)
        os.close(self.oldhandle)
        self.oldhandle = None
        with os.fdopen(self.piper, 'r') as reader:
            output = reader.read()
        self.piper = None
        self.started = False
        return output

    def getvalue(self):
        """Return the output captured since the last getvalue, or the
        start of the redirection, and keep capturing afterwards.
        """
        output = self.stop()
        self.start()
        return output
import tensorflow as tf

# Capture what tf.Print emits on stderr while the op is evaluated.
x = tf.constant([1, 2, 3])
a = tf.Print(x, [x])
redirect = FDRedirector(STDERR)
sess = tf.InteractiveSession()
redirect.start()
a.eval()
print("Result")
print(redirect.stop())
I ran into the same problem and got around it by using a function like this in my notebooks:
def tf_print(tensor, transform=None):
    """Return ``tensor`` unchanged while printing its value from Python.

    A Python op is inserted into the graph and attached as a control
    dependency of the returned identity tensor.

    Args:
        tensor: the tensor whose value should be printed.
        transform: optional callable applied to the value before printing
            (the value itself is printed when this is None).

    Returns:
        ``tf.identity(tensor)`` with the printing op as a control dependency.
    """
    def print_tensor(x):
        # x is typically a numpy array here, so transform may summarise it
        # freely to keep the output digestible.
        shown = x if transform is None else transform(x)
        print(shown)
        return x

    log_op = tf.py_func(print_tensor, [tensor], [tensor.dtype])[0]
    with tf.control_dependencies([log_op]):
        return tf.identity(tensor)
# Now define a tensor and use the tf_print function much like the tf.identity function
tensor = tf_print(tf.random_normal([100, 100]), transform=lambda x: [np.min(x), np.max(x)])
# This will print the transformed version of the tensors actual value
# (which was summarized to just the min and max for brevity)
sess = tf.InteractiveSession()
sess.run([tensor])
sess.close()
FYI, using a logger instead of calling "print" in my custom function worked wonders for me as the stdout is often buffered by jupyter and not shown before "Loss is Nan" kind of errors -- which was the whole point in using that function in the first place in my case.
You can check the terminal where you launched the jupyter notebook to see the message.
import tensorflow as tf
tf.InteractiveSession()
a = tf.constant(1)
b = tf.constant(2)
opt = a + b
opt = tf.Print(opt, [opt], message="1 + 2 = ")
opt.eval()
In the terminal, I can see:
2018-01-02 23:38:07.691808: I tensorflow/core/kernels/logging_ops.cc:79] 1 + 2 = [3]
A simple way, tried it in regular python, but not jupyter yet.
os.dup2(sys.stdout.fileno(), 1)
os.dup2(sys.stdout.fileno(), 2)
Explanation is here: In python, how to capture the stdout from a c++ shared library to a variable
The issue that I faced was that one can't run a session inside a Tensorflow Graph, like in the training or in the evaluation.
That's why the options to use sess.run(opt) or opt.eval() were not a solution for me.
The best thing was to use tf.Print() and redirect the logging to an external file.
I did this using a temporary file, which I then copied to a regular file like this:
import os
import sys
import tempfile

STDERR = 2


class captured:
    """Context manager that redirects a file descriptor into a temporary file.

    ``__enter__`` returns the ``tempfile.NamedTemporaryFile`` that receives
    everything written to the descriptor while the block runs.
    """

    def __init__(self, fd=STDERR):
        """Remember which descriptor to capture.

        Args:
            fd: the file descriptor to redirect (stderr by default).
        """
        self.fd = fd
        self.prevfd = None

    def __enter__(self):
        """Point ``fd`` at a new temporary file and return that file."""
        t = tempfile.NamedTemporaryFile()
        # Keep a duplicate of the original target so it can be restored.
        self.prevfd = os.dup(self.fd)
        os.dup2(t.fileno(), self.fd)
        return t

    def __exit__(self, exc_type, exc_value, traceback):
        """Restore the original descriptor and release the saved duplicate."""
        # Flush Python-level buffers first so pending text lands in the
        # temporary file rather than in the restored stream.
        if self.fd == 1:
            sys.stdout.flush()
        elif self.fd == 2:
            sys.stderr.flush()
        os.dup2(self.prevfd, self.fd)
        # The duplicate is no longer needed; closing it avoids leaking a
        # descriptor on every use.
        os.close(self.prevfd)
        self.prevfd = None
with captured(fd=STDERR) as tmp:
...
classifier.evaluate(input_fn=input_fn, steps=100)
with open('log.txt', 'w') as f:
print(open(tmp.name).read(), file=f)
And then in my evaluation I do:
a = tf.constant(1)
a = tf.Print(a, [a], message="a: ")