Does TensorBoard support exporting CSV files from the command line? I ask because I have a lot of logging directories and I am hoping to write a script that automates the process. Thanks.
The API supports reading event files programmatically. Here's an example that extracts the data for a tag and saves it to a .csv in a format similar to those generated by TensorBoard:
import argparse

import numpy as np
import tensorflow as tf


def save_tag_to_csv(fn, tag='test_metric', output_fn=None):
    if output_fn is None:
        output_fn = '{}.csv'.format(tag.replace('/', '_'))
    print("Will save to {}".format(output_fn))

    sess = tf.InteractiveSession()

    wall_step_values = []
    with sess.as_default():
        for e in tf.train.summary_iterator(fn):
            for v in e.summary.value:
                if v.tag == tag:
                    wall_step_values.append((e.wall_time, e.step, v.simple_value))
    np.savetxt(output_fn, wall_step_values, delimiter=',', fmt='%10.5f')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('fn')
    parser.add_argument('--tag', default='test_metric')
    args = parser.parse_args()
    save_tag_to_csv(args.fn, tag=args.tag)
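To automate this across many logging directories (which is what the question asks about), a small driver loop can call save_tag_to_csv for each event file. A minimal sketch, assuming a hypothetical layout with one run per subdirectory under ./runs:

import os

runs_root = 'runs'  # hypothetical root; one subdirectory per training run
for run in sorted(os.listdir(runs_root)):
    run_dir = os.path.join(runs_root, run)
    for fname in os.listdir(run_dir):
        # TensorBoard event files are named with this prefix by convention
        if fname.startswith('events.out.tfevents'):
            save_tag_to_csv(os.path.join(run_dir, fname), tag='test_metric',
                            output_fn='{}.csv'.format(run))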
I need help with my code. I built a recommendation system using cosine similarity in a Colab notebook and serialized it with pickle. When I deserialize it inside the same Colab file it works perfectly fine, but when I deserialize it in a new Colab file it gives me an error:
name 'data' is not defined
data is a variable initialized with my dataset, and it lives outside of the class InstaPost.
import pandas as pd
import numpy as np
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import cosine_similarity
import dill as pickle

data = pd.read_csv("/content/instaData.txt")
data
data = data[["Caption", "Hashtags"]]
captions = data["Caption"].tolist()
uni_tfidf = text.TfidfVectorizer(input=captions, stop_words="english")
uni_matrix = uni_tfidf.fit_transform(captions)
uni_sim = cosine_similarity(uni_matrix)

def recommend_post(x):
    return ", ".join(data["Caption"].loc[x.argsort()[-7:-1]])

data["Recommended Post"] = [recommend_post(x) for x in uni_sim]

class InstaPost:
    def Post(number):
        count = 0
        wordy = (data["Recommended Post"][number])
        sentence = wordy.split(',')
        for i in sentence:
            count = count + 1
            print(count, " ", i)

obj = InstaPost
obj.Post(1)

pickle_out = open("modelREC", "wb")
pickle.dump(obj, pickle_out)
pickle_out.close()

pickle_in = open("modelREC", "rb")
exe = pickle.load(pickle_in)
print(exe.Post(10))
NOTE: in a different file,
print(exe.Post)
works and gives the output
<function InstaPost.Post at 0x7efc0b4c3f70>
If I need to pass a reference to the data, please guide me on how I should do it. It would be a great help to me.
Thanks in advance.
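Not from the original thread, but one common fix for this class of error is to make the class self-contained, so the pickle no longer depends on the module-level data; a rough sketch:

class InstaPost:
    def __init__(self, recommendations):
        # store what Post() needs on the instance so the pickle carries it along
        self.recommendations = recommendations

    def Post(self, number):
        for count, item in enumerate(self.recommendations[number].split(','), start=1):
            print(count, " ", item)

obj = InstaPost(data["Recommended Post"].tolist())
obj.Post(1)
# pickle.dump(obj, ...) now produces an object usable without the global `data`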
I'm converting a CSV file into a TFRecords file like this:
File: ./dataset/csv/file.csv
feature_1, feture_2, output
1, 1, 1
2, 2, 2
3, 3, 3
import tensorflow as tf
import csv
import os

print(tf.__version__)

def create_csv_iterator(csv_file_path, skip_header):
    with tf.io.gfile.GFile(csv_file_path) as csv_file:
        reader = csv.reader(csv_file)
        if skip_header:  # Skip the header
            next(reader)
        for row in reader:
            yield row

def _int64_feature(value):
    """Returns an int64_list from a bool / enum / int / uint."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def create_example(row):
    """Returns a tensorflow.Example Protocol Buffer object."""
    features = {}
    for feature_index, feature_name in enumerate(["feature_1", "feture_2", "output"]):
        feature_value = row[feature_index]
        features[feature_name] = _int64_feature(int(feature_value))
    return tf.train.Example(features=tf.train.Features(feature=features))

def create_tfrecords_file(input_csv_file):
    """Creates a TFRecords file for the given input data."""
    output_tfrecord_file = input_csv_file.replace("csv", "tfrecords")
    writer = tf.io.TFRecordWriter(output_tfrecord_file)

    print("Creating TFRecords file at", output_tfrecord_file, "...")
    for i, row in enumerate(create_csv_iterator(input_csv_file, skip_header=True)):
        if len(row) == 0:
            continue
        example = create_example(row)
        content = example.SerializeToString()
        writer.write(content)
    writer.close()
    print("Finish Writing", output_tfrecord_file)

create_tfrecords_file("./dataset/csv/file.csv")
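As a sanity check (my addition, not part of the original snippet), the generated file can be read back with tf.data; the feature names and the output path below follow the code above:

feature_spec = {
    "feature_1": tf.io.FixedLenFeature([], tf.int64),
    "feture_2": tf.io.FixedLenFeature([], tf.int64),
    "output": tf.io.FixedLenFeature([], tf.int64),
}
dataset = tf.data.TFRecordDataset("./dataset/tfrecords/file.tfrecords")
for parsed in dataset.map(lambda x: tf.io.parse_single_example(x, feature_spec)):
    print(parsed)  # each row comes back as a dict of scalar int64 tensors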
Then I read the generated TFRecords files using the ImportExampleGen class:
import os

import absl
import tensorflow_model_analysis as tfma
tf.get_logger().propagate = False

from tfx import v1 as tfx
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext
%load_ext tfx.orchestration.experimental.interactive.notebook_extensions.skip

context = InteractiveContext()

example_gen = tfx.components.ImportExampleGen(input_base="./dataset/tfrecords")
context.run(example_gen, enable_cache=True)

statistics_gen = tfx.components.StatisticsGen(
    examples=example_gen.outputs['examples'])
context.run(statistics_gen, enable_cache=True)

schema_gen = tfx.components.SchemaGen(
    statistics=statistics_gen.outputs['statistics'],
    infer_feature_shape=False)
context.run(schema_gen, enable_cache=True)
File: ./transform.py

def preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    print(inputs)
    return inputs

transform = tfx.components.Transform(
    examples=example_gen.outputs['examples'],
    schema=schema_gen.outputs['schema'],
    module_file=os.path.abspath("./transform.py"))
context.run(transform, enable_cache=True)
The print in the preprocessing_fn function shows that the inputs are SparseTensor objects. My question is: why? As far as I can tell, my dataset's samples are dense, so they should be Tensors instead. Am I doing something wrong?
For anyone else who might be struggling with the same issue, I found the culprit. It's the SchemaGen class. This is how I was instantiating its object:
schema_gen = tfx.components.SchemaGen(
    statistics=statistics_gen.outputs['statistics'],
    infer_feature_shape=False)
I don't know what the use case is for asking the SchemaGen class not to infer the shape of the features, but the tutorial I was following had it set to False and I had just copied and pasted it. Comparing with some other tutorials, I realized that this could be why I was getting SparseTensors.
So, if you let SchemaGen infer the shape of your features, or you load a hand-crafted schema in which you've set the shapes yourself, you'll get a Tensor in your preprocessing_fn. But if the shapes are not set, the features will be instances of SparseTensor.
For the sake of completeness, this is the fixed snippet:
schema_gen = tfx.components.SchemaGen(
    statistics=statistics_gen.outputs['statistics'],
    infer_feature_shape=True)
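If for some reason you have to keep infer_feature_shape=False, an untested sketch of a workaround is to densify the features inside preprocessing_fn yourself (this assumes each example carries exactly one value per feature):

import tensorflow as tf

def preprocessing_fn(inputs):
    outputs = {}
    for key, value in inputs.items():
        if isinstance(value, tf.SparseTensor):
            # collapse the length-1 sparse dimension into a dense tensor
            outputs[key] = tf.sparse.to_dense(value)
        else:
            outputs[key] = value
    return outputs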
Following the instructions in Colab, I could get a buffer and even build a pd.DataFrame from it (the file is just an example)...
# ... authentication
file_id = '1S1w0Z7g3bI1PGLPR49PW5VBRo7c_KYgU'  # titanic

# loading data
import io
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload

drive_service = build('drive', 'v3')  # , credentials=creds
request = drive_service.files().get_media(fileId=file_id)
buf = io.BytesIO()
downloader = MediaIoBaseDownload(buf, request)
# download in chunks until done (this loop was missing originally)
done = False
while not done:
    status, done = downloader.next_chunk()
buf.seek(0)

import pandas as pd
df = pd.read_csv(buf)
print(df.head())
But I have trouble creating the data flow into a Dataset correctly: the `buf` variable does not work in =>
dataset = tf.data.experimental.make_csv_dataset(csv_file_path,
                                                batch_size=100, num_epochs=1)
which accepts only a `csv_file_path` as its first argument. Is it possible in Colab to get IO from my Google Drive CSV file into a Dataset (used further in training)? And how can it be done in a memory-efficient manner?
P.S.
I understand that I could perhaps make the file public (in Google Drive) and get a URL to use the simple way:
#TRAIN_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
TRAIN_DATA_URL = "https://drive.google.com/file/d/1S1w0Z7g3bI1PGLPR49PW5VBRo7c_KYgU/view?usp=sharing"
train_file_path = tf.keras.utils.get_file("train.csv", TRAIN_DATA_URL)
dataset = tf.data.experimental.make_csv_dataset(train_file_path, batch_size=100, num_epochs=1)
But I DON'T want to share the real file... How do I keep the file confidential and still get IO from it (in Google Drive) into a tf.data.Dataset in Colab? (Preferably the shortest code; there will be much more code in the real project tested in Colab.)
drive.CreateFile HELPED (link): as I understand it, working in Colab means working in a separate environment (separate from my PC and internet environment)... So I tried (following the link):
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
# https://drive.google.com/file/d/1S1w0Z7g3bI1PGLPR49PW5VBRo7c_KYgU/view?usp=sharing
link = 'https://drive.google.com/open?id=1S1w0Z7g3bI1PGLPR49PW5VBRo7c_KYgU'
fluff, id = link.split('=')
print (id) # Verify that you have everything after '='
downloaded = drive.CreateFile({'id':id})
downloaded.GetContentFile('Filename.csv')
import tensorflow as tf
ds = tf.data.experimental.make_csv_dataset('Filename.csv', batch_size=100, num_epochs=1)
iterator = ds.as_numpy_iterator()
print(next(iterator))
It works for me. Thanks for the interest in the topic (if anybody tried it).
Even simpler:
# Load the Drive helper and mount
from google.colab import drive
drive.mount('/content/drive')

import tensorflow as tf  # import was missing in the original snippet

_types = [float(), float(), float(), float(), str()]
_lines = tf.data.TextLineDataset('/content/drive/My Drive/iris.csv')
ds = _lines.skip(1).map(lambda x: tf.io.decode_csv(x, record_defaults=_types))

ds0 = ds.take(2)
print(*ds0.as_numpy_iterator(), sep='\n')  # print list with sep => by rows
OR from a DataFrame (batched, for economical memory usage):
import numpy as np   # imports were missing in the original snippet
import pandas as pd
import tensorflow as tf

# Load the Drive helper and mount
from google.colab import drive
drive.flush_and_unmount()
drive.mount('/content/drive')

df = pd.read_csv('/content/drive/My Drive/iris.csv', dtype='float32',
                 converters={'variety': str}, nrows=20, decimal='.')
ds = tf.data.Dataset.from_tensor_slices(dict(df))  # if mixed types
ds = ds.shuffle(20, reshuffle_each_iteration=False)  # for the train ds ONLY!
ds = ds.batch(batch_size=4)
ds = ds.prefetch(4)

# labels
label = ds.map(lambda x: x['variety'])
print(list(label.as_numpy_iterator()))

# features
# features = ds.map(lambda x: (x['sepal.length'], x['sepal.width']))
# Or with dynamic keys:
features = ds.map(lambda x: (list(map(x.get, list(np.setdiff1d(list(x.keys()), ['variety']))))))
print(list(features.as_numpy_iterator()))
with any transformations applied in map...
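If the features and labels are needed together again (for example for Keras model.fit), the two mapped datasets above can be zipped back together. A short sketch; `model` is a hypothetical compiled Keras model:

train_ds = tf.data.Dataset.zip((features, label))  # yields (features, label) per batch
# model.fit(train_ds, epochs=10)  # assumes a model that accepts this input structure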
The current version of tensorflow-serving tries to load a warmup request from the assets.extra/tf_serving_warmup_requests file:
2018-08-16 16:05:28.513085: I tensorflow_serving/servables/tensorflow/saved_model_warmup.cc:83] No warmup data file found at /tmp/faster_rcnn_inception_v2_coco_2018_01_28_string_input_version-export/1/assets.extra/tf_serving_warmup_requests
I wonder whether TensorFlow provides a common API to export requests to that location, or whether we should write the requests there manually.
At this point there is no common API for exporting the warmup data into assets.extra. It's relatively simple to write a script, similar to the one below:
import tensorflow as tf
from tensorflow_serving.apis import model_pb2
from tensorflow_serving.apis import predict_pb2
from tensorflow_serving.apis import prediction_log_pb2

def main():
    with tf.python_io.TFRecordWriter("tf_serving_warmup_requests") as writer:
        request = predict_pb2.PredictRequest(
            model_spec=model_pb2.ModelSpec(name="<add here>"),
            inputs={"examples": tf.make_tensor_proto([<add here>])}
        )
        log = prediction_log_pb2.PredictionLog(
            predict_log=prediction_log_pb2.PredictLog(request=request))
        writer.write(log.SerializeToString())

if __name__ == "__main__":
    main()
We referred to the official doc.
Specifically, we used Classification instead of Prediction, so we altered that code to be:
log = prediction_log_pb2.PredictionLog(
    classify_log=prediction_log_pb2.ClassifyLog(request=<request>))
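For reference, a hedged sketch of how such a classify request might be assembled; the ClassificationRequest and Input protos come from tensorflow_serving.apis, and the model name and example content are placeholders:

from tensorflow_serving.apis import classification_pb2
from tensorflow_serving.apis import input_pb2

example = tf.train.Example()  # fill in the features your model expects
request = classification_pb2.ClassificationRequest(
    model_spec=model_pb2.ModelSpec(name="<add here>"),
    input=input_pb2.Input(
        example_list=input_pb2.ExampleList(examples=[example])))
log = prediction_log_pb2.PredictionLog(
    classify_log=prediction_log_pb2.ClassifyLog(request=request))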
This is a complete example of an object detection system using a ResNet model. The prediction input consists of an image.
import tensorflow as tf
import requests
import base64

from tensorflow.python.framework import tensor_util
from tensorflow_serving.apis import predict_pb2
from tensorflow_serving.apis import prediction_log_pb2

IMAGE_URL = 'https://tensorflow.org/images/blogs/serving/cat.jpg'
NUM_RECORDS = 100

def get_image_bytes():
    image_content = requests.get(IMAGE_URL, stream=True)
    image_content.raise_for_status()
    return image_content.content

def main():
    """Generate TFRecords for warming up."""
    with tf.io.TFRecordWriter("tf_serving_warmup_requests") as writer:
        image_bytes = get_image_bytes()
        predict_request = predict_pb2.PredictRequest()
        predict_request.model_spec.name = 'resnet'
        predict_request.model_spec.signature_name = 'serving_default'
        predict_request.inputs['image_bytes'].CopyFrom(
            tensor_util.make_tensor_proto([image_bytes], tf.string))
        log = prediction_log_pb2.PredictionLog(
            predict_log=prediction_log_pb2.PredictLog(request=predict_request))
        for r in range(NUM_RECORDS):
            writer.write(log.SerializeToString())

if __name__ == "__main__":
    main()
This script creates a file called "tf_serving_warmup_requests".
I moved this file to /your_model_location/resnet/1538687457/assets.extra/ and then restarted my Docker image to pick up the new changes.
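To double-check the contents of the file (my own addition, not from the original answer), the records can be read back and parsed as PredictionLog protos:

import tensorflow as tf
from tensorflow_serving.apis import prediction_log_pb2

for raw in tf.data.TFRecordDataset("tf_serving_warmup_requests").take(1):
    log = prediction_log_pb2.PredictionLog.FromString(raw.numpy())
    print(log.predict_log.request.model_spec)  # should print name: "resnet"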
I want to speed up matplotlib.savefig() for many figures with the multiprocessing module, and I am trying to benchmark the performance of parallel versus sequential execution.
Below is the code:
# -*- coding: utf-8 -*-
"""
Compare the time of matplotlib savefig() in parallel and sequence
"""
import numpy as np
import matplotlib.pyplot as plt
import multiprocessing
import time

def gen_fig_list(n):
    ''' generate a list to contain n demo scatter figure objects '''
    plt.ioff()
    fig_list = []
    for i in range(n):
        plt.figure()
        dt = np.random.randn(5, 4)
        fig = plt.scatter(dt[:, 0], dt[:, 1], s=abs(dt[:, 2]*1000), c=abs(dt[:, 3]*100)).get_figure()
        fig.FM_figname = "img" + str(i)
        fig_list.append(fig)
    plt.ion()
    return fig_list

def savefig_worker(fig, img_type, folder):
    file_name = folder + "\\" + fig.FM_figname + "." + img_type
    fig.savefig(file_name, format=img_type, dpi=fig.dpi)
    return file_name

def parallel_savefig(fig_list, folder):
    proclist = []
    for fig in fig_list:
        print fig.FM_figname,
        p = multiprocessing.Process(target=savefig_worker, args=(fig, 'png', folder))  # causes error
        proclist.append(p)
        p.start()
    for i in proclist:
        i.join()

if __name__ == '__main__':
    folder_1, folder_2 = 'Z:\\A1', 'Z:\\A2'
    fig_list = gen_fig_list(10)

    t1 = time.time()
    parallel_savefig(fig_list, folder_1)
    t2 = time.time()
    print '\nMulprocessing time : %0.3f' % (t2-t1)

    t3 = time.time()
    for fig in fig_list:
        savefig_worker(fig, 'png', folder_2)
    t4 = time.time()
    print 'Non_Mulprocessing time: %0.3f' % (t4-t3)
And I get the error "This application has requested the Runtime to terminate it in an unusual way. Please contact the application's support team for more information.", caused by p = multiprocessing.Process(target=savefig_worker, args=(fig, 'png', folder)).
Why? And how can I solve it?
(Windows XP + Python 2.6.1 + NumPy 1.6.2 + Matplotlib 1.2.0)
EDIT: (error message added, on Python 2.7.3)
When run in IDLE with Python 2.7.3, it gives the error message below:
>>>
img0
Traceback (most recent call last):
File "C:\Documents and Settings\Administrator\desktop\mulsavefig_pilot.py", line 61, in <module>
proc.start()
File "d:\Python27\lib\multiprocessing\process.py", line 130, in start
File "d:\Python27\lib\pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "d:\Python27\lib\pickle.py", line 748, in save_global
(obj, module, name))
PicklingError: Can't pickle <function notify_axes_change at 0x029F5030>: it's not found as matplotlib.backends.backend_qt4.notify_axes_change
EDIT: (My solution demo)
Inspired by Matplotlib: simultaneous plotting in multiple threads:
# -*- coding: utf-8 -*-
"""
Compare the time of matplotlib savefig() in parallel and sequence
"""
import numpy as np
import matplotlib.pyplot as plt
import multiprocessing
import time

def gen_data(fig_qty, bubble_qty):
    ''' generate data for fig drawing '''
    dt = np.random.randn(fig_qty, bubble_qty, 4)
    return dt

def parallel_savefig(draw_data, folder):
    ''' prepare data and pass to worker '''
    pool = multiprocessing.Pool()
    fig_qty = len(draw_data)
    fig_para = zip(range(fig_qty), draw_data, [folder]*fig_qty)
    pool.map(fig_draw_save_worker, fig_para)
    return None

def fig_draw_save_worker(args):
    seq, dt, folder = args
    plt.figure()
    fig = plt.scatter(dt[:, 0], dt[:, 1], s=abs(dt[:, 2]*1000), c=abs(dt[:, 3]*100), alpha=0.7).get_figure()
    plt.title('Plot of a scatter of %i' % seq)
    fig.savefig(folder + "\\" + 'fig_%02i.png' % seq)
    plt.close()
    return None

if __name__ == '__main__':
    folder_1, folder_2 = 'A1', 'A2'
    fig_qty, bubble_qty = 500, 100
    draw_data = gen_data(fig_qty, bubble_qty)

    print 'Mulprocessing ... ',
    t1 = time.time()
    parallel_savefig(draw_data, folder_1)
    t2 = time.time()
    print 'Time : %0.3f' % (t2-t1)

    print 'Non_Mulprocessing .. ',
    t3 = time.time()
    for para in zip(range(fig_qty), draw_data, [folder_2]*fig_qty):
        fig_draw_save_worker(para)
    t4 = time.time()
    print 'Time : %0.3f' % (t4-t3)
    print 'Speed Up: %0.1fx' % ((t4-t3)/(t2-t1))
You can try to move all of the matplotlib code (including the import) into a function.
Make sure you don't have an import matplotlib or import matplotlib.pyplot as plt at the top of your code.
Create a function that does all the matplotlib work, including the import.
Example:
import numpy as np
from multiprocessing import Pool  # was `import pool`, which breaks Pool(4) below

def graphing_function(graph_data):
    # import inside the worker so matplotlib is initialized per-process
    import matplotlib.pyplot as plt
    plt.figure()
    plt.hist(graph_data.data)
    plt.savefig(graph_data.filename)
    plt.close()
    return

pool = Pool(4)
pool.map(graphing_function, data_list)  # data_list: your list of plot-data objects
It is not really a bug, per se, more of a limitation.
The explanation is in the last line of your error message:
PicklingError: Can't pickle <function notify_axes_change at 0x029F5030>: it's not found as matplotlib.backends.backend_qt4.notify_axes_change
It is telling you that elements of the figure objects cannot be pickled, which is how multiprocessing passes data between processes. The objects are pickled in the main process, shipped as pickles, and then re-constructed on the other side. Even if you fixed this exact issue (maybe by using a different backend, or by stripping off the offending function, which might break things in other ways), I am pretty sure there are core parts of Figure, Axes, or Canvas objects that cannot be pickled.
As @bigbug points out, there is an example of how to get around this limitation: Matplotlib: simultaneous plotting in multiple threads. The basic idea is to push your entire plotting routine off to the sub-process so that only numpy arrays and maybe some configuration information cross the process boundary.