OCR in the background while displaying findings in GUI - pandas

I am trying to run a OCR function in the background while displaying the findings in a GUI.
The OCR is working fine, but I can't seem to get the GUI started.
I think the issues is that there is not function to start the GUI, but I have not would a solution.
import time
import cv2
import mss
import numpy
import pytesseract
import matplotlib.pyplot as plt
import pandas as pd
from PIL import Image
import PySimpleGUI as sg
import os
import threading
from concurrent.futures import ThreadPoolExecutor
from PIL import Image
import pytesseract
pytesseract.pytesseract.tesseract_cmd = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
def ocr_function():
with mss.mss() as mss_instance:
mon = mss_instance.monitors[0]
screenshot = mss_instance.grab(mon) #Read all monitor(s)
with mss.mss() as sct:
while True:
im = numpy.asarray(sct.grab(mon))
plt.imshow(cv2.cvtColor(im, cv2.COLOR_BGR2RGB))
text = pytesseract.image_to_string(im) #image to text
time.sleep(5) #One screenshot per 5 seconds
os.system('cls') #Clear output
print(text) #Print the output text
return text #Return text to function
#plt.show()
Output = ocr_function
t = threading.Thread(target=ocr_function) #Create a thread for the OCR function
t.start() #Start the OCR thread
Output = df.sort_values(by=['Match_Acc.','D-level', 'R-level'], ascending=[False, False, False]) # Sort columes
Output = Output[Output['Match_Acc.'] >= 1]
font = ('Areal', 11)
sg.theme('BrownBlue')
data = Output
headings = ['Result', 'Column1','Column2','Column3','D-level','R-level','n_matches','nan','nonnan','Match_Acc.']
df = pd.DataFrame(data)
headings = df.columns.tolist()
data = df.values.tolist()
layout = [[sg.Table(data, headings=headings, justification='left', key='-TABLE-')],
[sg.Button('Run'), sg.Button('Exit')]]
sg.Window("Overview", layout).read(close=True)

Related

how to plot labels TFRecords in histogram

Hello i have many files of TFRecords. i use python tensorflow and want to plot in one histogram all labels.
TFRecords is pair of (image,label)
so how i can extract all the labels ?
i have try to extract labels and have success plot several batches
all_label = []
for image, label in ds_train.take(10):
all_label.append(label)
sns.distplot(all_label)
Maybe something like this.
import re
#import pdftotext
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
def convert_pdf_to_txt(path):
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
fp = open(path, 'rb')
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0
caching = True
pagenos=set()
for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
interpreter.process_page(page)
text = retstr.getvalue()
fp.close()
device.close()
retstr.close()
return text
with open('C:\\Users\\Finance10K.txt') as f:
clean_cont = f.read().splitlines()
clean_cont
doc=[i.replace('\xe2\x80\x9c','') for i in clean_cont ]
doc=[i.replace('\xe2\x80\x9d','') for i in doc ]
doc=[i.replace('\xe2\x80\x99s','') for i in doc ]
docs = [x for x in doc if x != ' ']
docss = [x for x in docs if x != '']
doc
docs
docss
financedoc=[re.sub("[^a-zA-Z]+", " ", s) for s in docss]
financedoc
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
import pandas as pd
#%pylab
#%matplotlib inline
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
vect=CountVectorizer(ngram_range=(1,1),stop_words='english')
fin=vect.fit_transform(financedoc)
fin
pd.DataFrame(fin.toarray(),columns=vect.get_feature_names())
lda=LatentDirichletAllocation(n_components=5)
lda.fit_transform(fin)
lda_dtf=lda.fit_transform(fin)
sorting=np.argsort(lda.components_)[:,::-1]
features=np.array(vect.get_feature_names())
import mglearn
mglearn.tools.print_topics(topics=range(5), feature_names=features,
sorting=sorting, topics_per_chunk=5, n_words=10)
#from __future__ import print_function
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
zit=pyLDAvis.sklearn.prepare(lda,fin,vect)
pyLDAvis.show(zit)

How to wrap text in a dataframe's table (converted to .png)

I am having an issue where I cannot format my tables. The text is too long to just edit the dimensions or the text size. How can I quickly change this so you can see all the text when I have the data for each column more filled in? I am looking for a wrap text kind of function but I don't know if that is possible the way I'm doing it. Is there another way you'd recommend? I'm changing the table into a .png to insert into an Excel file. It has to be a .png so it's an object and doesn't mess with the size of the rows and columns in Excel.
import matplotlib.pyplot as plt
import xlsxwriter as xl
import numpy as np
import yfinance as yf
import pandas as pd
import datetime as dt
import mplfinance as mpf
import pandas_datareader
from pandas_datareader import data as pdr
yf.pdr_override()
import numpy as np
Individualreport = "C:\\Users\\Ashley\\FromPython.xlsx"
Ticklist = pd.read_excel("C:\\Users\\Ashley\\Eyes Trial Data Center.xlsx",sheet_name='Tickers', header=None)
stocks = Ticklist.values.ravel()
PipelineData = pd.read_excel("C:\\Users\\Ashley\\Eyes Trial Data Center.xlsx", sheet_name='Pipeline', header=None)
writer = pd.ExcelWriter(Individualreport, engine='xlsxwriter')
for i in stocks:
#write pipeline data
t = PipelineData.loc[(PipelineData[0]==i)]
print(t)
def render_mpl_table(data, col_width=10, row_height=1, font_size=10, wrap=True,
header_color='#40466e', row_colors=['#f1f1f2', 'w'], edge_color='w',
bbox=[0, 0, 1, 1], header_columns=0,
ax=None, **kwargs):
if ax is None:
size = (np.array(data.shape[::-1]) + np.array([0, 1])) * np.array([col_width, row_height])
fig, ax = plt.subplots(figsize=size)
ax.axis('off')
mpl_table = ax.table(cellText=data.values, bbox=bbox, colLabels=data.columns, **kwargs)
mpl_table.auto_set_font_size(False)
#mpl_table.set_fontsize(font_size)
for k, cell in mpl_table._cells.items():
cell.set_edgecolor(edge_color)
if k[0] == 0 or k[1] < header_columns:
cell.set_text_props(weight='bold', color='w')
cell.set_facecolor(header_color)
else:
cell.set_facecolor(row_colors[k[0]%len(row_colors) ])
return ax.get_figure(), ax
fig,ax = render_mpl_table(t, header_columns=0, col_width=2.0)
fig.savefig(str(i)+'pipe.png')
I think I needed to use an additional package, haven't tried with this example, but worked in another similar example I did.
from textwrap import wrap
label = ("label text that is getting put in the graph")
label = [ '\n'.join(wrap(l, 20)) for l in label ]
#20 is number of characters per line

Accessing methods within a class from bokeh FileInput widget

I am working on a Bokeh serve UI and am running into trouble interfacing a class (and its methods) with the FileInput widget. I am using a class (in this example, called "EIS_data") which, when instantiated, loads a file using pd.read_csv. The EIS_data class also has a method to plot the data in a particular way, and I'd like to be able to load the pandas dataframe and call and manipulate the data using the methods already in place in the class.
So far, I have been able to load the data successfully using the FileInput widget, but I can't figure out how to access the dataframe again once it's loaded in. In a standalone Jupyter notebook, I could run d = EIS_data("filename") and then ```d.plot''' to load the data into a pandas dataframe and then plot it according to the method defined in the EIS_data class, but I can't figure out how to replicate this in the UI code once the data are loaded using the FileInput widget.
Is there a way I can interface this with Bokeh widgets, such that I could simply add d.plot() to curdoc()? I have found a workaround using ColumnDataSource, but it seems a shame to redefine plotting methods and data handling when they are already defined in the class. Below are minimal working examples of the UI code and the class definition.
UI Code:
import numpy as np
import pandas as pd
from eis_analysis_trimmed import EIS_data
import bokeh
from bokeh.io import curdoc
from bokeh import layouts
from bokeh.layouts import column,row,gridplot
from bokeh.plotting import figure
from bokeh.models import *
import base64
import io
## Instantiate the EIS_data class for loading data
def load_data(f):
return EIS_data(f)
## updater function called to load data with FileInput widget
## Must be decoded using base64
def load_file(attr, old, new):
decoded = base64.b64decode(new)
d = io.BytesIO(decoded)
dat = load_data(d)
print(dat.df)
print(dat)
print("EIS Data Uploaded Successfully")
return dat
f_load = Paragraph(text="""Load Data""",height=15)
f = FileInput()
f.on_change('value',load_file)
curdoc().add_root(column(f))
and here is the EIS_data class:
import numpy as np
from scipy.optimize import curve_fit
from sklearn.metrics import r2_score
from bokeh.plotting import figure, show
from bokeh.models import LinearAxis, Range1d
from bokeh.resources import INLINE
import bokeh.io
#locally include javascript dependencies in html
bokeh.io.output_notebook(INLINE)
class EIS_data:
def __init__(self, file_name, delimiter='\t',
header=0, f_low=None, f_high=None):
#load eis data into a pandas dataframe
eis_data = pd.read_csv(file_name, delimiter=delimiter, header=header)
#iterate through all of the columns and check to see
#if all of the values in that column are null
#if they are, then remove that column
for c in eis_data.columns:
if eis_data[c].isnull().all():
eis_data = eis_data.drop([c], axis=1)
#make sure that the data are imported as floats and not strings
eis_data = eis_data[['freq/Hz', 'Re(Z)/Ohm', '-Im(Z)/Ohm']]
eis_data['freq/Hz'] = pd.to_numeric(eis_data['freq/Hz'])
eis_data['Re(Z)/Ohm'] = pd.to_numeric(eis_data['Re(Z)/Ohm'])
eis_data['-Im(Z)/Ohm'] = pd.to_numeric(eis_data['-Im(Z)/Ohm'])
self.df = eis_data.sort_values(by='freq/Hz')
def plot(self, fit_vals = None):
plot = figure(title="Nyquist Plot",
x_axis_label='Re(Z) Ohm',
y_axis_label='-Im(Z) Ohm',
plot_width=600,
plot_height=600)
plot.circle(self.df['Re(Z)/Ohm'], self.df['-Im(Z)/Ohm'],
size=7, color='navy', name='Data')
return plot
EDIT: Adding the workaround using ColumnDataSource
from bokeh.layouts import column
from bokeh.plotting import figure
from bokeh.models import *
from bokeh.models.widgets import FileInput
import base64
import io
from eis_analysis2 import EIS_data
# Instantiate the EIS_data class for loading data
def load_data(data):
return EIS_data(data)
# updater function called to load data with FileInput widget
# Must be decoded using base64
def load_file(attr, old, new):
decoded = base64.b64decode(new)
d = io.BytesIO(decoded)
dat = load_data(d)
dat_df = dat.df
# Replace plot data with data from newly-loaded file
source.data = dict(freq=dat_df[dat_df.columns[0]], reZ=dat_df[dat_df.columns[1]], imZ=dat_df[dat_df.columns[2]])
#phase,mag = bode_calc(reZ,imZ)
print(dat_df)
print("EIS Data Uploaded Successfully")
# Create Column Data Source that will be used by the plot
source = ColumnDataSource(data=dict(freq=[], reZ=[], imZ=[]))
##Make the nyquist plot
nyq_plot = figure(title="Nyquist Plot",
x_axis_label='Re(Z) Ohm',
y_axis_label='-Im(Z) Ohm',
plot_width=600,
plot_height=600)
nyq_plot.circle(x="reZ", y="imZ",source=source,size=7, color='navy', name='Data')
f = FileInput()
f.on_change('value', load_file)
layout = column(f, nyq_plot)
curdoc().add_root(layout)

Blank Bokeh plot when reading data from dataframe and using time on x-axis

I am unable to display plot using Bokeh. I am reading the data from dataframe. Here's a snippet of my Python code.
I am new to Bokeh. I tried following some of the examples from the User Guide. I'm unable to figure out what's going wrong here. Please advise.
import datetime
import pandas
from bokeh.plotting import figure, show, output_file, output_notebook
from bokeh.models import ColumnDataSource
PATH_TO_CSV = "Sample_Data.csv"
output_notebook()
data = pd.read_csv(PATH_TO_CSV, index_col=False)
data['timestamp'] = pd.to_datetime(data['timestamp']).dt.strftime("%H:%M:%S")
source = ColumnDataSource(data)
p = figure(plot_width=400, plot_height=400, x_axis_type="datetime")
p.line('timestamp', 'event_msg', source=source)
show(p)
Here's sample .csv
event_msg,timestamp
Created,2019-03-02 13:19:44.164562-0700
Created,2019-03-02 13:20:32.212323-0700
Created,2019-03-02 13:20:56.582761-0700
Modified,2019-03-02 13:21:48.021752-0700
Deleted,2019-03-02 13:22:16.938382-0700
Modified,2019-03-02 13:22:22.139714-0700
Permission changed,2019-03-02 13:24:20.195975-0700
Deleted,2019-03-02 13:33:53.049900-0700
Modified,2019-03-02 13:33:56.266113-0700
Deleted,2019-03-02 13:33:59.757584-0700
I am seeing completely blank plot. Ideally, I am interested in plotting different line plots based on the event messages.
You should convert your time like this:
data['timestamp'] = pd.to_datetime(data['timestamp'])
So your code should look like (tested with Bokeh v1.1.0):
import os
import datetime
import pandas as pd
from bokeh.plotting import figure, show, output_file, output_notebook
from bokeh.models import ColumnDataSource
PATH_TO_CSV = "Sample_Data.csv"
output_notebook()
data = pd.read_csv(os.path.join(os.path.dirname(__file__), PATH_TO_CSV), index_col = False)
data['timestamp'] = pd.to_datetime(data['timestamp'])
source = ColumnDataSource(data)
p = figure(plot_width = 400, plot_height = 400, x_axis_type = "datetime", y_range = data['event_msg'].unique())
p.line('timestamp', 'event_msg', source = source)
show(p)
Result:

How to implement QThread correctly with matplotlib and pyplot

I understand that there have been one or two other questions posted that are related but not exactly what I need. I'm building this gui that activates a module by clicking a button. This python module that gets activated by pushing the button generates heatmaps from multiple pandas dataframes and saves those images, which in turn is then saved into an xlsx using pandas ExcelWriter.
I've tried to implement QThread, as other stackoverflow examples tried to explain similar problems but I continue getting this error: "It is not safe to use pixmaps outside the GUI thread". I understand that technically I'm not creating the heatmap inside the MAIN gui thread but I thought with QThread that I am still inside "a" gui thread. These dataframes that the heatmaps are based off of can be of a large size at times and I am somewhat grasping the concept of sending a signal to the main gui thread when a heatmap is to be created and have the heatmap function inside the main gui class...but I fear that will be troublesome later in passing so much data around..this is more like pipelining than threading. I just want this working thread to create these images and save them and then take those saved files and save them into an xlsx without interrupting the main gui..
(NOTE: This is a simplified version, in the real program there will be several of these threads created almost simultaneously and inside each thread several heatmaps will be created)
---main.py---
import sys
from MAIN_GUI import *
from PyQt4 import QtGui, QtCore
from excel_dummy import *
if __name__=="__main__":
app = QtGui.QApplication(sys.argv)
class MAIN_GUI(QtGui.QMainWindow):
def __init__(self):
super(MAIN_GUI, self).__init__()
self.uiM = Ui_MainWindow()
self.uiM.setupUi(self)
self.connect(self.uiM.updateALL_Button,QtCore.SIGNAL('clicked()'),self.newThread)
def newThread(self):
Excelify = excelify()
Excelify.start()
self.connect(Excelify,QtCore.SIGNAL('donethread(QString)'),(self.done))
def done(self):
print('done')
main_gui = MAIN_GUI()
main_gui.show()
main_gui.raise_()
sys.exit(app.exec_())
---excel_dummy.py---
import os, pandas as pd
from pandas import ExcelWriter
import numpy as np
import seaborn.matrix as sm
from PyQt4 import QtCore
from PyQt4.QtCore import QThread
from matplotlib.backends.backend_agg import FigureCanvas
from matplotlib.figure import Figure
import time
class excelify(QThread):
def __init__(self):
QThread.__init__(self)
def run(self):
path = 'home/desktop/produced_files'
with ExcelWriter(path + '/final.xlsx', engine='xlsxwriter') as writer:
workbook = writer.book
worksheet = workbook.add_worksheet()
heatit = self.heatmap()
worksheet.insert_image('C3',path + '/' + 'heat.jpg')
worksheet.write(2, 2, 'just write something')
writer.save()
print('file size: %s "%s"' % (os.stat(path).st_size, path))
time.slee(0.3)
self.emit(QtCore.SIGNAL('donethread(QString)'),'')
def heatmap(self):
df = pd.DataFrame(np.array([[1,22222,33333],[2,44444,55555],[3,44444,22222],[4,55555,33333]]),columns=['hour','in','out'])
dfu = pd.DataFrame(df.groupby([df.in,df.hour]).size())
dfu.reset_index(inplace=True)
dfu.rename(columns={'0':'Count'})
dfu.columns=['in','hour','Count']
dfu_2 = dfu.copy()
mask=0
fig = Figure()
ax = fig.add_subplot(1,1,1)
canvas = FigureCanvas(fig)
df_heatmap = dfu_2.pivot('in','hour','Count').fillna(0)
sm.heatmap(df_heatmap,ax=ax,square=True,annot=False,mask=mask)
fig.savefig(path + '/' + heat.jpg')
---MAIN_GUI.py---
from PyQt4 import QtCore,QtGui
try:
_fromUtf8 = QtCore.QString.fromUtf8
except AttributeError:
def _fromUtf8(s):
return s
try:
_encoding = QtGui.QApplication.unicodeUTF8
def _translate(context, text, disambig):
return QtGui.QApplication.translate(context, text, disambig, _encoding)
except AttributeError:
def _translate(context, text, disambig):
return QtGui.QApplication.translate(context, text, disambig)
class Ui_MainWindow(object):
def setupUi(self, MainWindow):
MainWindow.setObjectName(_fromUtf8("MainWindow"))
MainWindow.resize(320,201)
self.centralwidget = QtGui.QWidget(MainWindow)
self.centralwidget.setObjectName(_fromUtf8("centralwidget"))
self.updateALL_Button = QtGui.QPushButton(self.centralwidget)
self.updateALL_Button.setGeometry(QtCore.QRect(40,110,161,27))
self.updateALL_Button.setFocusPolicy(QtCore.Qt.NoFocus)
self.updateALL_Button.setObjectName(_fromUtf8("Options_updateALL_Button"))
MainWindow.setCentralWidget(self.centralwidget)
self.menubar = QtGui.QMenuBar(MainWindow)
self.menubar.setGeometry(QtCore.QRect(0, 0, 320, 24))
self.menubar.setObjectName(_fromUtf8("menubar"))
MainWindow.setMenuBar(self.menubar)
self.statusbar = QtGui.QStatusBar(MainWindow)
self.statusbar.setObjectName(_fromUtf8("statusbar"))
MainWindow.setStatusBar(self.statusbar)
self.retranslateUi(MainWindow)
QtCore.QMetaObject.connectSlotsByName(MainWindow)
def retranslateUi(self,MainWindow):
MainWindow.setWindowTitle(_translate("MainWindow", "MainWindow", None))
self.updateALL_Button.setText(_translate("MainWindow", "updateALL", None))
Even though you are explicitely using the Agg backend to generate your figure, it looks like Seaborn is still using the default backend on your system, which is most likely Qt4Agg, an interactive backend. We want Seaborn to use a non-interactive backend instead to avoid any error (see matplotlib documentation for more details about backends). To do so, tell Matplotlib in your imports to use the Agg backend and import Seaborn after Matplotlib.
You will also need to save your figure as a png, since jpg is not supported by the Agg backend. Unless you have some specific reasons for using jpg, png is usually a better format for graphs.
Finally, you could use a memory buffer instead of saving your images to a temporary file before saving them in an Excel Workbook. I haven't tested it, but it will probably be faster if you are working with large files.
Below is a MWE I've written which includes the aformentioned points and which does not give any error on my system in Python3.4:
import pandas as pd
import time
from pandas import ExcelWriter
import numpy as np
from PyQt4 import QtCore, QtGui
import matplotlib as mpl
mpl.use('Agg')
from matplotlib.backends.backend_agg import FigureCanvas
import seaborn.matrix as sm
try: # Python 2 (not tested)
from cStringIO import StringIO as BytesIO
except ImportError: # Python 3
from io import BytesIO
class MAIN_GUI(QtGui.QWidget):
def __init__(self):
super(MAIN_GUI, self).__init__()
self.worker = Excelify()
btn = QtGui.QPushButton('Run')
disp = QtGui.QLabel()
self.setLayout(QtGui.QGridLayout())
self.layout().addWidget(btn, 0, 0)
self.layout().addWidget(disp, 2, 0)
self.layout().setRowStretch(1, 100)
btn.clicked.connect(self.worker.start)
self.worker.figSaved.connect(disp.setText)
class Excelify(QtCore.QThread):
figSaved = QtCore.pyqtSignal(str)
def run(self):
self.figSaved.emit('Saving figure to Workbook.')
t1 = time.clock()
image_data = self.heatmap()
with ExcelWriter('final.xlsx', engine='xlsxwriter') as writer:
wb = writer.book
ws = wb.add_worksheet()
ws.insert_image('C3', 'heat.png', {'image_data': image_data})
writer.save()
t2 = time.clock()
self.figSaved.emit('Done in %f sec.' % (t2-t1))
def heatmap(self):
df = pd.DataFrame(np.array([[1, 22222, 33333], [2, 44444, 55555],
[3, 44444, 22222], [4, 55555, 33333]]),
columns=['hour', 'in', 'out'])
dfu = pd.DataFrame(df.groupby([df.out, df.hour]).size())
dfu.reset_index(inplace=True)
dfu.rename(columns={'0': 'Count'})
dfu.columns = ['in', 'hour', 'Count']
fig = mpl.figure.Figure()
fig.set_canvas(FigureCanvas(fig))
ax = fig.add_subplot(111)
df_heatmap = dfu.pivot('in', 'hour', 'Count').fillna(0)
sm.heatmap(df_heatmap, ax=ax, square=True, annot=False, mask=0)
buf= BytesIO()
fig.savefig(buf, format='png')
return(buf)
if __name__ == '__main__':
import sys
app = QtGui.QApplication(sys.argv)
w = MAIN_GUI()
w.show()
w.setFixedSize(200, 100)
sys.exit(app.exec_())