How to create my own handwriting dataset like the IAM dataset - tensorflow

I need to create my own handwritten character dataset in the same format as the IAM Handwriting Database. I don't know how to build a dataset like that; you can check the dataset format on their site. I need to create data/ascii/words.txt and data/words/.

There are no official instructions for recreating the IAM Handwriting Database, but you can find a starting point here: Build a Handwritten Text Recognition System using TensorFlow.
import os
import numpy as np
import cv2


class DataProvider():
    "this class creates machine-written text for a word list. TODO: change getNext() to return your samples."

    def __init__(self, wordList):
        self.wordList = wordList
        self.idx = 0

    def hasNext(self):
        return self.idx < len(self.wordList)

    def getNext(self):
        img = np.ones((32, 128), np.uint8) * 255
        word = self.wordList[self.idx]
        self.idx += 1
        cv2.putText(img, word, (2, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0), 1, cv2.LINE_AA)
        return (word, img)


def createIAMCompatibleDataset(dataProvider):
    "this function converts the passed dataset to an IAM compatible dataset"

    # create files and directories
    f = open('words.txt', 'w+')
    if not os.path.exists('sub'):
        os.makedirs('sub')
    if not os.path.exists('sub/sub-sub'):
        os.makedirs('sub/sub-sub')

    # go through data and convert it to IAM format
    ctr = 0
    while dataProvider.hasNext():
        sample = dataProvider.getNext()

        # write img
        cv2.imwrite('sub/sub-sub/sub-sub-%d.png' % ctr, sample[1])

        # write filename, dummy-values and text
        line = 'sub-sub-%d' % ctr + ' X X X X X X X ' + sample[0] + '\n'
        f.write(line)

        ctr += 1


if __name__ == '__main__':
    words = ['some', 'words', 'for', 'which', 'we', 'create', 'text-images']
    dataProvider = DataProvider(words)
    createIAMCompatibleDataset(dataProvider)
The source code above was written by Harald Scheidl.
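If you specifically want the data/ascii/words.txt and data/words/ layout mentioned in the question, one option is to rearrange the generated files afterwards. This is only a minimal sketch, assuming the sub/sub-sub naming produced by createIAMCompatibleDataset above:

import os
import shutil

# sketch: move the generated files into the layout from the question,
# i.e. data/ascii/words.txt and data/words/<dir>/<subdir>/<id>.png
os.makedirs('data/ascii', exist_ok=True)
os.makedirs('data/words/sub/sub-sub', exist_ok=True)
shutil.move('words.txt', 'data/ascii/words.txt')
for name in os.listdir('sub/sub-sub'):
    shutil.move(os.path.join('sub/sub-sub', name), os.path.join('data/words/sub/sub-sub', name))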

Related

What is the most efficient way of creating a tf.dataset from multiple json.gz files with multiple text records?

I have thousands of json.gz files, each with a variety of information about scientific papers. For each file, I have to extract the relevant information - e.g. title and labels - to make a dataset, then transform it to a tf.dataset. However, this is inefficient because I cannot filter on the subjects directly or shuffle the records in a single step.
I would like to read the files with tf.data's interleave so that I can shuffle them, and also filter them according to specific labels.
Here is how I'm doing it so far:
import gzip
import json
import datetime

import tensorflow as tf
import pandas as pd

# themes is assumed to be a dict mapping subject names to integer labels, defined elsewhere


# For relevant feature extraction
def load_file(file):
    # with gzip.open(bytes.decode(file), 'r') as fin:  # 4. gzip
    with gzip.open(file, 'r') as fin:
        json_bytes = fin.read()
    json_str = json_bytes.decode('utf-8')  # 2. string (i.e. JSON)
    bb = json.loads(json_str)
    bb = pd.json_normalize(bb, 'items',
                           ['indexed', ['title', 'publisher', 'type', 'indexed.date-parts', 'subject']],
                           errors='ignore')
    bb.dropna(subset=['title', 'publisher', 'type', 'indexed.date-parts', 'subject'], inplace=True)
    bb.subject = bb.subject.apply(
        lambda x: int(themes[list(set(x) & set(list(themes.keys())))[0]])
        if len(list(set(x) & set(list(themes.keys())))) > 0
        else len(list(themes.keys())) + 1)
    bb.title = bb.title.str.join('').values
    # bb['author'] = bb['author'].apply(lambda x: '; '.join([', '.join([i['given'], i['family']]) for i in x]))
    bb['indexed.date-parts'] = bb['indexed.date-parts'].apply(
        lambda tpl: datetime.datetime.strptime('-'.join(str(x) for x in tpl[0]), '%Y-%m-%d').strftime('%Y-%m-%d'))
    # bb = bb.sample(n=32, replace=True)
    # return bb.title.str.join('').values, bb.subject.str.join(', ').values
    return dict(bb[['title', 'publisher', 'type', 'indexed.date-parts', 'subject']])


file_list = ['file_2021_01/10625.json.gz',
             'file_2021_01/23897.json.gz',
             'file_2021_01/12169.json.gz',
             'file_2021_01/427.json.gz', ...]

filenames = tf.data.Dataset.list_files(file_list, shuffle=True)
dataset = filenames.apply(
    tf.data.experimental.parallel_interleave(
        lambda x: tf.data.Dataset.from_tensor_slices(tf.numpy_function(load_file, [x], (tf.int64))),
        cycle_length=1))
However, it results in an error:
InternalError: Unsupported object type dict
[[{{node PyFunc}}]] [Op:IteratorGetNext]
Thanks
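The dict return value is the likely culprit: tf.numpy_function can only hand back tensors of the dtypes you declare, not a Python dict. Below is a minimal sketch of one way around it, assuming load_file can be reduced to returning plain numpy arrays; the helper names load_file_arrays and make_dataset, and the example filter predicate, are my own, not an official API.

import numpy as np
import tensorflow as tf


def load_file_arrays(path):
    # reuse the existing parsing logic, but return arrays instead of a dict
    bb = load_file(path)
    titles = np.asarray(bb['title'], dtype=str)
    subjects = np.asarray(bb['subject'], dtype=np.int64)
    return titles, subjects


def make_dataset(path):
    titles, subjects = tf.numpy_function(
        load_file_arrays, [path], (tf.string, tf.int64))
    titles.set_shape([None])
    subjects.set_shape([None])
    return tf.data.Dataset.from_tensor_slices((titles, subjects))


filenames = tf.data.Dataset.list_files(file_list, shuffle=True)
dataset = filenames.interleave(
    make_dataset, cycle_length=4, num_parallel_calls=tf.data.AUTOTUNE)
# filtering can then be done on tensors, e.g. keep only certain label values
dataset = dataset.filter(lambda title, subject: subject < 10)  # example predicate

Note that tf.data.experimental.parallel_interleave is deprecated in recent TensorFlow releases, which is why the sketch uses Dataset.interleave with num_parallel_calls directly.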

Text classification using embedding for two columns of dataset

I am working on a project where I am using mental-health-related subreddit posts containing two feature columns (text, title) and a label column (subreddit).
I want to use an LSTM for classification, where I need to create an embedding matrix for both columns; in short, I need both columns for text classification, but I cannot find a way to embed both columns.
The code I am using for the text sequences is:
text_sequences_train = token.texts_to_sequences(preprocessed_text_train)
title_sequences_train = token.texts_to_sequences(preprocessed_title_train)
#print(sequences_train)
train=np.hstack(text_sequences_train+title_sequences_train)
train.reshape(1,train.shape[0])
train_seq_x=pad_sequences(train, maxlen=300)
text_sequences_test = token.texts_to_sequences(preprocessed_text_test)
title_sequences_test = token.texts_to_sequences(preprocessed_title_test)
#print(sequences_train)
test=np.hstack(text_sequences_test+title_sequences_test)
test.reshape(1,test.shape[0])
test_seq_x=pad_sequences(test, maxlen=300)
text_sequences_val = token.texts_to_sequences(preprocessed_text_val)
title_sequences_val = token.texts_to_sequences(preprocessed_title_val)
#print(sequences_train)
val=np.hstack(text_sequences_val+title_sequences_val)
val.reshape(1,val.shape[0])
val_seq_x=pad_sequences(val, maxlen=300)
The above code gives me an error:
ValueError: `sequences` must be a list of iterables. Found non-iterable: 428.0
The code I am using for the embedding matrix is:
import numpy as np
import tqdm

glove_file = "glove.42B.300d.txt"
EMBEDDING_VECTOR_LENGTH = 300  # <=200


def construct_embedding_matrix(glove_file, word_index):
    embedding_dict = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # get the word
            word = values[0]
            if word in word_index.keys():
                # get the vector
                vector = np.asarray(values[1:], 'float32')
                embedding_dict[word] = vector
                # print(embedding_dict[word].shape)

    ### oov words (out of vocabulary words) will be mapped to 0 vectors
    num_words = len(word_index) + 1
    # initialize it to 0
    embedding_matrix = np.zeros((num_words, EMBEDDING_VECTOR_LENGTH))
    for word, i in tqdm.tqdm(word_index.items()):
        if i < num_words:
            vect = embedding_dict.get(word, [])
            if len(vect) > 0:
                embedding_matrix[i] = vect[:EMBEDDING_VECTOR_LENGTH]
                # print(embedding_matrix[i].shape)
    print(embedding_matrix)
    return embedding_matrix


# word_index is assumed to come from the fitted tokenizer, e.g. word_index = token.word_index
embedding_matrix = construct_embedding_matrix(glove_file, word_index)
If I convert the text sequences first and then do the train/test split, I get an error that the number of samples in X and y do not match.
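The np.hstack call is what breaks pad_sequences: it flattens everything into a 1-D array of scalars, and pad_sequences expects a list of sequences. A common alternative is to pad the two columns separately and feed them into a two-input model that shares the embedding. The following is only a sketch under my own assumptions (MAX_LEN_TITLE, num_classes and y_train are placeholders you would set from your data):

import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, concatenate
from tensorflow.keras.models import Model

MAX_LEN_TEXT, MAX_LEN_TITLE = 300, 30   # assumed maximum lengths
num_classes = 5                         # assumed number of subreddits

# pad each column on its own instead of hstack-ing them into one flat array
X_text_train = pad_sequences(text_sequences_train, maxlen=MAX_LEN_TEXT)
X_title_train = pad_sequences(title_sequences_train, maxlen=MAX_LEN_TITLE)

num_words = embedding_matrix.shape[0]
text_in = Input(shape=(MAX_LEN_TEXT,))
title_in = Input(shape=(MAX_LEN_TITLE,))

# one embedding layer, initialised with the GloVe matrix, shared by both inputs
emb = Embedding(num_words, EMBEDDING_VECTOR_LENGTH,
                weights=[embedding_matrix], trainable=False)
text_feat = LSTM(64)(emb(text_in))
title_feat = LSTM(32)(emb(title_in))

merged = concatenate([text_feat, title_feat])
out = Dense(num_classes, activation='softmax')(merged)

model = Model([text_in, title_in], out)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# model.fit([X_text_train, X_title_train], y_train, validation_data=..., epochs=...)

Training then takes a list of two padded arrays as input, which also avoids the X/y sample-count mismatch, since neither column is ever flattened.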

Can I use OR-tools for TSP with a partial distance matrix (for a huge set of nodes)?

I'm trying to solve a TSP with OR-tools for a problem of roughly 80,000 nodes. The problem is that I need a huge distance matrix, which takes too much memory, so the approach is infeasible and I never get a solution.
So:
Is there an option to work with a partial distance matrix in OR-tools?
If not, is there a way to improve my code?
Is there another external solver in Python that can handle this task?
My current code follows; a sketch of a matrix-free distance callback appears after it.
import math
from collections import namedtuple
import random
import time
from sklearn.metrics.pairwise import euclidean_distances
import numpy as np
import numba
from scipy.spatial import distance_matrix
from math import sqrt

Point = namedtuple("Point", ['x', 'y'])


def length(point1, point2):
    # Euclidean distance helper (assumed from the assignment template; it is
    # called below but was not included in the original snippet)
    return math.sqrt((point1.x - point2.x) ** 2 + (point1.y - point2.y) ** 2)


def solve_it(input_data):
    # Modify this code to run your optimization algorithm
    global POINTS

    # parse the input
    lines = input_data.split('\n')
    nodeCount = int(lines[0])

    points = []
    for i in range(1, nodeCount + 1):
        line = lines[i]
        parts = line.split()
        points.append(Point(float(parts[0]), float(parts[1])))

    # 2. routing with or-tools
    def dist_matrix(nodeCount, points):
        data = []
        for k in range(len(points)):
            data.append([int(points[k].x), int(points[k].y)])
        D = euclidean_distances(data, data)
        return D

    def create_data_model(D):
        """Stores the data for the problem."""
        data = {}
        data['distance_matrix'] = D  # yapf: disable
        data['num_vehicles'] = 1
        data['depot'] = 0
        return data

    def print_solution(manager, routing, solution):
        index = routing.Start(0)
        plan_output = []  # Route for vehicle 0
        route_distance = 0
        while not routing.IsEnd(index):
            plan_output.append(manager.IndexToNode(index))
            index = solution.Value(routing.NextVar(index))
        return plan_output

    def or_main(nodeCount, points):
        from ortools.constraint_solver import routing_enums_pb2
        from ortools.constraint_solver import pywrapcp
        """Entry point of the program."""
        # Instantiate the data problem.
        global sol
        D = dist_matrix(nodeCount, points)
        data = create_data_model(D)

        # Create the routing index manager.
        manager = pywrapcp.RoutingIndexManager(len(data['distance_matrix']),
                                               data['num_vehicles'], data['depot'])

        # Create Routing Model.
        routing = pywrapcp.RoutingModel(manager)

        def distance_callback(from_index, to_index):
            """Returns the distance between the two nodes."""
            # Convert from routing variable Index to distance matrix NodeIndex.
            from_node = manager.IndexToNode(from_index)
            to_node = manager.IndexToNode(to_index)
            return data['distance_matrix'][from_node][to_node]

        transit_callback_index = routing.RegisterTransitCallback(distance_callback)

        # Define cost of each arc.
        routing.SetArcCostEvaluatorOfAllVehicles(transit_callback_index)

        # Setting first solution heuristic.
        search_parameters = pywrapcp.DefaultRoutingSearchParameters()
        search_parameters.local_search_metaheuristic = (
            routing_enums_pb2.LocalSearchMetaheuristic.GUIDED_LOCAL_SEARCH)
        k = 100
        if nodeCount <= 100:
            k = 30
        elif 100 <= nodeCount <= 1000:
            k = 300
        elif nodeCount > 1000:
            k = 17000
        search_parameters.time_limit.seconds = k
        search_parameters.log_search = True

        # Solve the problem.
        solution = routing.SolveWithParameters(search_parameters)

        # print solution on console.
        if solution:
            sol = print_solution(manager, routing, solution)
        return sol

    ######################################################################
    solution = or_main(nodeCount, points)

    # calculate the length of the tour
    obj = length(points[solution[-1]], points[solution[0]])
    for index in range(0, nodeCount - 1):
        obj += length(points[solution[index]], points[solution[index + 1]])

    # prepare the solution in the specified output format
    output_data = '%.2f' % obj + ' ' + str(0) + '\n'
    output_data += ' '.join(map(str, solution))

    return output_data


if __name__ == '__main__':
    import sys
    if len(sys.argv) > 1:
        file_location = sys.argv[1].strip()
        with open(file_location, 'r') as input_data_file:
            input_data = input_data_file.read()
        # print(solve_it(input_data))
    else:
        print('This test requires an input file. Please select one from the data directory. (i.e. python solver.py ./data/tsp_51_1)')
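On the first question: OR-tools does not actually require a precomputed matrix. The transit callback can compute each distance on the fly from the coordinates, which removes the O(n^2) memory cost, although a pure-Python callback will be slow for 80,000 nodes, so this is a sketch of the idea rather than a guaranteed fix. The helper name or_main_lazy and the time limit are my own choices:

import math
from ortools.constraint_solver import routing_enums_pb2
from ortools.constraint_solver import pywrapcp


def or_main_lazy(nodeCount, points):
    # Matrix-free variant: distances are computed on demand inside the callback,
    # so no nodeCount x nodeCount array is ever allocated.
    manager = pywrapcp.RoutingIndexManager(nodeCount, 1, 0)
    routing = pywrapcp.RoutingModel(manager)

    def distance_callback(from_index, to_index):
        a = points[manager.IndexToNode(from_index)]
        b = points[manager.IndexToNode(to_index)]
        # OR-tools expects integer arc costs, so round the euclidean distance
        return int(math.hypot(a.x - b.x, a.y - b.y))

    transit_callback_index = routing.RegisterTransitCallback(distance_callback)
    routing.SetArcCostEvaluatorOfAllVehicles(transit_callback_index)

    search_parameters = pywrapcp.DefaultRoutingSearchParameters()
    search_parameters.first_solution_strategy = (
        routing_enums_pb2.FirstSolutionStrategy.PATH_CHEAPEST_ARC)
    search_parameters.local_search_metaheuristic = (
        routing_enums_pb2.LocalSearchMetaheuristic.GUIDED_LOCAL_SEARCH)
    search_parameters.time_limit.seconds = 600  # assumed budget; tune for your machine

    solution = routing.SolveWithParameters(search_parameters)
    if solution is None:
        return None
    route, index = [], routing.Start(0)
    while not routing.IsEnd(index):
        route.append(manager.IndexToNode(index))
        index = solution.Value(routing.NextVar(index))
    return route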

Why won't the NASA pictures display?

The pictures are not displaying, although the code executes just fine. I took out the API key.
import datetime
import requests
from IPython.display import Image


def gimmePictures(num):
    for n in range(0, num):
        now = datetime.datetime.now()
        day4Pictures = now - datetime.timedelta(days=n)
        data = {'api_key': '',
                'date': day4Pictures.date()}
        print(data)
        # using the params argument in our request
        result = requests.get('https://api.nasa.gov/planetary/apod', params=data)
        # create a dictionary for yesterday's picture
        dict_day = result.json()
        print(dict_day['date'])
        Image(dict_day['url'])


gimmePictures(10)
How can I display an image from a file in Jupyter Notebook?
from IPython.display import Image, display


def gimmePictures(num):
    listofImageNames = []
    for n in range(0, num):
        now = datetime.datetime.now()
        day4Pictures = now - datetime.timedelta(days=n)
        data = {'api_key': 'dcS6cZ9DJ4zt9oXwjF6hgemj38bNJo0IGcvFGZZj', 'date': day4Pictures.date()}
        # using the params argument in our request
        result = requests.get('https://api.nasa.gov/planetary/apod', params=data)
        # create a dictionary for yesterday's picture
        dict_day = result.json()
        listofImageNames.append(dict_day['url'])
    for imageName in listofImageNames:
        display(Image(imageName))


gimmePictures(10)
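For what it's worth, the first version probably fails simply because the Image object created inside the function is never shown: a notebook only auto-displays the value of the last expression in a cell, so inside a function the object has to be passed to IPython.display.display, which is what the second version does. A minimal sketch covering both a remote URL and a local file (the local filename is hypothetical):

from IPython.display import Image, display

display(Image(url=dict_day['url']))             # remote APOD image by URL
display(Image(filename='my_apod_picture.jpg'))  # hypothetical local file on disk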

How to use PyPy as the notebook interpreter?

I have a script that extracts data from some CSV files and splits the data into different Excel files. I am using IPython for that, and I'm sure it uses CPython as the default interpreter.
But the script takes too much time to finish. Can someone please help me run this script with PyPy, as I've heard it is much faster than CPython?
The script is something like this:
import pandas as pd
import xlsxwriter as xw
import csv
import pymsgbox as py

file1 = "vDashOpExel_Change_20150109.csv"
file2 = "vDashOpExel_T3Opened_20150109.csv"
path = r"C:\Users\Abhishek\Desktop\Pandas Anlaysis"  # raw string so the backslashes are not treated as escapes


def uniq(words):
    seen = set()
    for word in words:
        l = word.lower()
        if l in seen:
            continue
        seen.add(l)
        yield word


def files(file_name):
    df = pd.read_csv(path + '\\' + file_name, sep=',', encoding='utf-16')
    final_frame = df.dropna(how='all')
    file_list = list(uniq(list(final_frame['DOEClient'])))
    return file_list, final_frame


def fill_data(f_list, frame1=None, frame2=None):
    if f_list is not None:
        for client in f_list:
            writer = pd.ExcelWriter(path + '\\' + 'Accounts' + '\\' + client + '.xlsx', engine='xlsxwriter')
            if frame1 is not None:
                data1 = frame1[frame1.DOEClient == client]  # Filter the Data
                data1.to_excel(writer, 'Change', index=False, header=True)  # Importing the Data to Excel File
            if frame2 is not None:
                data2 = frame2[frame2.DOEClient == client]  # Filter the Data
                data2.to_excel(writer, 'Opened', index=False, header=True)  # Importing the Data to Excel File
            writer.save()  # finalize the workbook so it is actually written to disk
    else:
        py.alert('Please enter the First Parameter !!!', 'Error')


list1, frame1 = files(file1)
list2, frame2 = files(file2)
final_list = set(list1 + list2)
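Two hedged notes. First, the snippet above builds final_list, frame1 and frame2 but never calls fill_data, so presumably a driver call like the sketch below is missing; with that in place the script can be run from the command line with whichever interpreter you like (e.g. pypy script.py, assuming pandas, xlsxwriter and pymsgbox are installed for that interpreter), and to use PyPy inside a notebook the usual route is to install ipykernel under PyPy and register it as an extra kernel (pypy -m pip install ipykernel, then pypy -m ipykernel install --user --name pypy). Second, pandas leans heavily on C extensions, so the PyPy speed-up for this particular workload may be modest.

# Sketch of the presumably missing driver call; the argument pairing
# (final_list together with both frames) is my guess at the intent.
fill_data(final_list, frame1=frame1, frame2=frame2)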