I have a script that extracts data from some CSV files and splits the data into different Excel files. I am using IPython for this, and I am fairly sure it is using CPython as the default interpreter.
But the script takes too much time to finish the whole process. Can someone please help me run this script with PyPy, as I have heard it is much faster than CPython?
The script is something like this:
import pandas as pd
import xlsxwriter as xw
import csv
import pymsgbox as py
file1 = "vDashOpExel_Change_20150109.csv"
file2 = "vDashOpExel_T3Opened_20150109.csv"
path = "C:\Users\Abhishek\Desktop\Pandas Anlaysis"
def uniq(words):
    seen = set()
    for word in words:
        l = word.lower()
        if l in seen:
            continue
        seen.add(l)
        yield word
def files(file_name):
    df = pd.read_csv(path + '\\' + file_name, sep=',', encoding='utf-16')
    final_frame = df.dropna(how='all')
    file_list = list(uniq(list(final_frame['DOEClient'])))
    return file_list, final_frame
def fill_data(f_list, frame1=None, frame2=None):
    if f_list is not None:
        for client in f_list:
            writer = pd.ExcelWriter(path + '\\' + 'Accounts' + '\\' + client + '.xlsx', engine='xlsxwriter')
            if frame1 is not None:
                data1 = frame1[frame1.DOEClient == client]  # filter the data for this client
                data1.to_excel(writer, 'Change', index=False, header=True)  # write the data to the Excel file
            if frame2 is not None:
                data2 = frame2[frame2.DOEClient == client]  # filter the data for this client
                data2.to_excel(writer, 'Opened', index=False, header=True)  # write the data to the Excel file
    else:
        py.alert('Please enter the First Parameter !!!', 'Error')
list1, frame1 = files(file1)
list2, frame2 = files(file2)
final_list = set(list1 + list2)
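As an aside that may also help with the run time (a sketch based on my own assumptions, not part of the original script): the loop above filters the whole frame once per client, so grouping each frame a single time with groupby avoids those repeated scans regardless of which interpreter runs the code. The function name and output directory below are made up for illustration:
import pandas as pd

def split_by_client(frame1, frame2, out_dir):
    # group each frame once; every group already holds the rows for one client
    groups1 = {k: g for k, g in frame1.groupby('DOEClient')} if frame1 is not None else {}
    groups2 = {k: g for k, g in frame2.groupby('DOEClient')} if frame2 is not None else {}
    for client in sorted(set(groups1) | set(groups2)):
        with pd.ExcelWriter(out_dir + '\\' + str(client) + '.xlsx', engine='xlsxwriter') as writer:
            if client in groups1:
                groups1[client].to_excel(writer, sheet_name='Change', index=False)
            if client in groups2:
                groups2[client].to_excel(writer, sheet_name='Opened', index=False)
Unlike uniq() above, groupby matches client names case-sensitively, so this is only equivalent if the casing is consistent across files.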
I'm running this Flask app:
from flask import Flask, request, jsonify, render_template
from flask_cors import CORS, cross_origin
import json
import pandas as pd
# Create the app object
app = Flask(__name__)
cors = CORS(app, resources= {r"/*": {'origins' : "*"}})
# importing function for calculations
from Record_Matching import Matching
@app.route("/query", methods=['GET'])
@cross_origin()
def query():
    # service_account_creds = request.json
    query1 = request.args.get('query1', type=str)
    query2 = request.args.get('query2', type=str)
    querycolumns = request.args.get('querycolumns')
    project_id = request.args.get('project_id', type=str)
    service_account_creds = request.args.get('service_account')
    SS = request.args.get('SS', type=float)
    TT = request.args.get('TT', type=float)
    result = Matching(query1, query2, SS, TT, service_account_creds, project_id, querycolumns)
    return result

if __name__ == "__main__":
    app.run(host="localhost", port=8080, debug=True)
and I'm importing the Matching function from this Python script:
import pandas as pd
from google.cloud import bigquery
from google.oauth2 import service_account
import recordlinkage
from recordlinkage.preprocessing import phonetic
from pandas.io.json import json_normalize
import uuid
from uuid import uuid4
import random
import string
import json
import ast
# Results to data frame function
def gcp2df(sql, client):
    query = client.query(sql)
    results = query.result()
    return results.to_dataframe()
# Exporting df to bigquery - table parameter example: "dataset.tablename"
# def insert(df, table):
# client = bigquery.Client()
# job_config = bigquery.LoadJobConfig(write_disposition=bigquery.job.WriteDisposition.WRITE_TRUNCATE)
# return client.load_table_from_dataframe(df, table, job_config = job_config)
def pair(df1, df2, TT, querycolumns):
    # function to take a pair of columns from the list and compare them:
    L = querycolumns
    l = len(querycolumns)
    p1 = 0
    p2 = 1
    # To generate phonetics we need to make sure all names are in English,
    # so we replace non-English words with random English strings.
    df1[L[p1]] = df1[L[p1]].astype(str)
    df2[L[p2]] = df2[L[p2]].astype(str)
    for i in range(0, len(df1)):
        if df1[L[p1]][i].isascii() == False:
            df1[L[p1]][i] = ''.join(random.choices(string.ascii_lowercase, k=5))
    for i in range(0, len(df2)):
        if df2[L[p2]][i].isascii() == False:
            df2[L[p2]][i] = ''.join(random.choices(string.ascii_lowercase, k=5))
    compare = recordlinkage.Compare()
    df1["phonetic_given_name"] = phonetic(df1[L[p1]], "soundex")
    df2["phonetic_given_name"] = phonetic(df2[L[p2]], "soundex")
    df1["initials"] = (df1[L[p1]].str[0] + df1[L[p1]].str[-1])
    df2["initials"] = (df2[L[p2]].str[0] + df2[L[p2]].str[-1])
    indexer = recordlinkage.Index()
    indexer.block('initials')
    candidate_links = indexer.index(df1, df2)
    compare.exact('phonetic_given_name', 'phonetic_given_name', label="phonetic_given_name")
    # O(n): two pointers walk consecutive column pairs from the input list
    while p2 <= l:
        compare.string(L[p1], L[p2], method='jarowinkler', threshold=TT, label=L[p1])
        p1 += 2
        p2 += 2
    features = compare.compute(candidate_links, df1, df2)
    return features
def Matching(query1, query2, SS, TT, service_account_creds, project_id, querycolumns):
    service_account_creds = ast.literal_eval(service_account_creds)
    credentials = service_account.Credentials(service_account_creds, service_account_creds['client_email'],
                                              service_account_creds['token_uri'])
    job_config = bigquery.LoadJobConfig()
    client = bigquery.Client(project=project_id)
    SS = int(SS)
    TT = float(TT)
    df1 = gcp2df("""{}""".format(query1), client)
    df2 = gcp2df("""{}""".format(query2), client)
    querycolumns = json.loads(querycolumns)
    querycolumns = list(querycolumns.values())
    features = pair(df1, df2, TT, querycolumns)
    features['Similarity_score'] = features.sum(axis=1)
    features = features[features['Similarity_score'] >= SS].reset_index()
    final = features[['level_0', 'level_1']]
    final.rename(columns={'level_0': 'df1_index', 'level_1': 'df2_index'}, inplace=True)
    final['Unique_ID'] = [uuid.uuid4() for _ in range(len(final.index))]
    final['Unique_ID'] = final['Unique_ID'].astype(str)
    final['Similarity_Score'] = SS
    final_duplicates = final['df1_index'].value_counts().max()
    # insert(final, "test-ahmed-project.Record_Linkage.Matching_Indices")
    message = "Mission accomplished!, your highest number of duplicates is " + str(final_duplicates)
    return {'message': message, 'final': final.to_dict('records'), 'df1': df1.to_dict('records')}
I'm not sure why returning df1 as a dictionary raises a ValueError when I call the function from the Flask app, but when I run it in a Jupyter notebook, using the same dataframe taken from BigQuery, it works just fine. Why does it not work in the Flask app?
I used to_dict('records') to convert the dataframe to a dictionary.
Looking online, many resources suggest the error occurs because the data contains missing values, but that shouldn't be the problem here, because converting the same dataframe to a dictionary in a Jupyter notebook works just fine.
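Since the resources you found point at missing values, one cheap way to rule that out in the Flask path specifically is to strip NaN before building the response. This is only a diagnostic sketch under that assumption, not a confirmed fix:
import numpy as np

# Replace NaN with None (which serializes as JSON null) before converting;
# a to_dict() call in a notebook never goes through Flask's JSON serialization
# of the returned dict, which is one obvious difference between the two runs.
df1_clean = df1.replace({np.nan: None})
payload = df1_clean.to_dict('records')
If the ValueError still occurs with the cleaned frame, the missing-value theory can be discarded and the full traceback should point at the real culprit.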
I'm trying to solve a TSP with OR-Tools for a problem of roughly 80,000 nodes. The problem is that it needs a huge distance matrix that takes too much memory, so it is infeasible and I don't get a solution.
So:
Is there an option to work with a partial distance matrix in OR-Tools?
If not, is there a way to improve my code?
Is there another external solver in Python that could handle this task?
import math
from math import sqrt
from collections import namedtuple
import random
import time
import numpy as np
import numba
from scipy.spatial import distance_matrix
from sklearn.metrics.pairwise import euclidean_distances
Point = namedtuple("Point", ['x', 'y'])

# Euclidean distance between two points (helper used by solve_it below;
# not shown in the original snippet, standard definition assumed)
def length(point1, point2):
    return math.sqrt((point1.x - point2.x) ** 2 + (point1.y - point2.y) ** 2)
def solve_it(input_data):
    # Modify this code to run your optimization algorithm
    global POINTS

    # parse the input
    lines = input_data.split('\n')
    nodeCount = int(lines[0])
    points = []
    for i in range(1, nodeCount + 1):
        line = lines[i]
        parts = line.split()
        points.append(Point(float(parts[0]), float(parts[1])))
    # 2. routing with OR-Tools
    def dist_matrix(nodeCount, points):
        data = []
        for k in range(len(points)):
            data.append([int(points[k].x), int(points[k].y)])
        D = euclidean_distances(data, data)
        return D

    def create_data_model(D):
        """Stores the data for the problem."""
        data = {}
        data['distance_matrix'] = D  # yapf: disable
        data['num_vehicles'] = 1
        data['depot'] = 0
        return data

    def print_solution(manager, routing, solution):
        index = routing.Start(0)
        plan_output = []  # route for vehicle 0
        route_distance = 0
        while not routing.IsEnd(index):
            plan_output.append(manager.IndexToNode(index))
            index = solution.Value(routing.NextVar(index))
        return plan_output
    def or_main(nodeCount, points):
        """Entry point of the program."""
        from ortools.constraint_solver import routing_enums_pb2
        from ortools.constraint_solver import pywrapcp
        # Instantiate the data problem.
        global sol
        D = dist_matrix(nodeCount, points)
        data = create_data_model(D)

        # Create the routing index manager.
        manager = pywrapcp.RoutingIndexManager(len(data['distance_matrix']),
                                               data['num_vehicles'], data['depot'])

        # Create the routing model.
        routing = pywrapcp.RoutingModel(manager)

        def distance_callback(from_index, to_index):
            """Returns the distance between the two nodes."""
            # Convert from routing variable Index to distance matrix NodeIndex.
            from_node = manager.IndexToNode(from_index)
            to_node = manager.IndexToNode(to_index)
            return data['distance_matrix'][from_node][to_node]

        transit_callback_index = routing.RegisterTransitCallback(distance_callback)

        # Define cost of each arc.
        routing.SetArcCostEvaluatorOfAllVehicles(transit_callback_index)

        # Set the local search metaheuristic and the time limit.
        search_parameters = pywrapcp.DefaultRoutingSearchParameters()
        search_parameters.local_search_metaheuristic = (
            routing_enums_pb2.LocalSearchMetaheuristic.GUIDED_LOCAL_SEARCH)
        k = 100
        if nodeCount <= 100:
            k = 30
        elif 100 <= nodeCount <= 1000:
            k = 300
        elif nodeCount > 1000:
            k = 17000
        search_parameters.time_limit.seconds = k
        search_parameters.log_search = True

        # Solve the problem.
        solution = routing.SolveWithParameters(search_parameters)

        # Collect the route if a solution was found.
        if solution:
            sol = print_solution(manager, routing, solution)
        return sol
    ######################################################################
    solution = or_main(nodeCount, points)

    # calculate the length of the tour
    obj = length(points[solution[-1]], points[solution[0]])
    for index in range(0, nodeCount - 1):
        obj += length(points[solution[index]], points[solution[index + 1]])

    # prepare the solution in the specified output format
    output_data = '%.2f' % obj + ' ' + str(0) + '\n'
    output_data += ' '.join(map(str, solution))

    return output_data
if __name__ == '__main__':
    import sys
    if len(sys.argv) > 1:
        file_location = sys.argv[1].strip()
        with open(file_location, 'r') as input_data_file:
            input_data = input_data_file.read()
        print(solve_it(input_data))
    else:
        print('This test requires an input file. Please select one from the data directory. (i.e. python solver.py ./data/tsp_51_1)')
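One way to avoid materializing the full 80,000 x 80,000 matrix is to let the transit callback compute each distance on demand from the coordinates, so only the point list is kept in memory. This is a hedged sketch reusing the names from the code above (points, one vehicle, depot 0), not a drop-in replacement for or_main:
import math
from ortools.constraint_solver import pywrapcp

def build_routing(points):
    # one vehicle, depot at node 0, as in or_main above
    manager = pywrapcp.RoutingIndexManager(len(points), 1, 0)
    routing = pywrapcp.RoutingModel(manager)

    def distance_callback(from_index, to_index):
        # compute the Euclidean distance on the fly instead of looking it
        # up in a precomputed matrix; OR-Tools expects integer costs
        a = points[manager.IndexToNode(from_index)]
        b = points[manager.IndexToNode(to_index)]
        return int(math.hypot(a.x - b.x, a.y - b.y))

    transit_callback_index = routing.RegisterTransitCallback(distance_callback)
    routing.SetArcCostEvaluatorOfAllVehicles(transit_callback_index)
    return manager, routing
Note that with 80,000 nodes the callback is still invoked a very large number of times, so the time limit matters more than memory with this approach.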
I am trying to do principal component analysis on a CSV file, but when I run the code I get this error:
C:\Users\Lenovo\Desktop>python pca.py
ValueError: could not convert string to float: Annee;NET;INT;SUB;LMT;DCT;IMM;EXP;VRD
This is my CSV file.
I tried removing any spaces and anything else I could think of.
This is my Python script; I don't know what I am missing.
Note: I run this code under Python 2.7.
from sklearn.externals import joblib
import numpy as np
import glob
import os
import time
import numpy
my_matrix = numpy.loadtxt(open("pca.csv","rb"),delimiter= ",",skiprows=0)
def pca(dataMat, r, autoset_r=False, autoset_rate=0.9):
    """
    purpose: principal components analysis
    """
    print("Start to do PCA...")
    t1 = time.time()
    meanVal = np.mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVal
    # normData = meanRemoved / np.std(dataMat)
    covMat = np.cov(meanRemoved, rowvar=0)
    eigVals, eigVects = np.linalg.eig(np.mat(covMat))
    eigValIndex = np.argsort(-eigVals)
    if autoset_r:
        r = autoset_eigNum(eigVals, autoset_rate)
        print("autoset: take top {} of {} features".format(r, meanRemoved.shape[1]))
    r_eigValIndex = eigValIndex[:r]
    r_eigVect = eigVects[:, r_eigValIndex]
    lowDDataMat = meanRemoved * r_eigVect
    reconMat = (lowDDataMat * r_eigVect.T) + meanVal
    t2 = time.time()
    print("PCA takes %f seconds" % (t2 - t1))
    joblib.dump(r_eigVect, './pca_args_save/r_eigVect.eig')
    joblib.dump(meanVal, './pca_args_save/meanVal.mean')
    return lowDDataMat, reconMat

def autoset_eigNum(eigValues, rate=0.99):
    eigValues_sorted = sorted(eigValues, reverse=True)
    eigVals_total = eigValues.sum()
    for i in range(1, len(eigValues_sorted) + 1):
        eigVals_sum = sum(eigValues_sorted[:i])
        if eigVals_sum / eigVals_total >= rate:
            break
    return i
It seems that NumPy has a problem parsing your header row to float.
Try setting skiprows=1 in your np.loadtxt call in order to skip the table header.
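Since the unparsed string in the error is the whole line Annee;NET;INT;SUB;LMT;DCT;IMM;EXP;VRD, the file also looks semicolon-separated rather than comma-separated, so (reading only from that message) both the delimiter and the header row probably need handling:
import numpy as np

# skiprows=1 drops the header line; delimiter=";" matches the separators
# visible in the error message
my_matrix = np.loadtxt(open("pca.csv", "rb"), delimiter=";", skiprows=1)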
I need to create my own handwritten character dataset in the same format as the IAM Handwriting Database. I don't know how to create a dataset just like that; you can check the dataset format on their site. I need to create data/ascii/words.txt and data/words/.
There are no instructions for creating the IAM Handwriting Database format, but you can find help here: Build a Handwritten Text Recognition System using TensorFlow.
import os
import numpy as np
import cv2
class DataProvider():
    "this class creates machine-written text for a word list. TODO: change getNext() to return your samples."

    def __init__(self, wordList):
        self.wordList = wordList
        self.idx = 0

    def hasNext(self):
        return self.idx < len(self.wordList)

    def getNext(self):
        img = np.ones((32, 128), np.uint8) * 255
        word = self.wordList[self.idx]
        self.idx += 1
        cv2.putText(img, word, (2, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0), 1, cv2.LINE_AA)
        return (word, img)


def createIAMCompatibleDataset(dataProvider):
    "this function converts the passed dataset to an IAM compatible dataset"
    # create files and directories
    f = open('words.txt', 'w+')
    if not os.path.exists('sub'):
        os.makedirs('sub')
    if not os.path.exists('sub/sub-sub'):
        os.makedirs('sub/sub-sub')

    # go through data and convert it to IAM format
    ctr = 0
    while dataProvider.hasNext():
        sample = dataProvider.getNext()
        # write img
        cv2.imwrite('sub/sub-sub/sub-sub-%d.png' % ctr, sample[1])
        # write filename, dummy-values and text
        line = 'sub-sub-%d' % ctr + ' X X X X X X X ' + sample[0] + '\n'
        f.write(line)
        ctr += 1


if __name__ == '__main__':
    words = ['some', 'words', 'for', 'which', 'we', 'create', 'text-images']
    dataProvider = DataProvider(words)
    createIAMCompatibleDataset(dataProvider)
The source code was written by Harald Scheidl.
I want to know if it is possible to import attitude and position data (roll/pitch/yaw and XYZ) from a comma-separated file into Blender.
I recorded data from a little RC car and I want to represent its movement in a 3D world.
I have timestamps too, so if there's a way to animate the movement of the object, that would be superb!
Any help will be greatly appreciated!
Best regards.
A slight modification, making use of the csv module:
import bpy
import csv
position_vectors = []
filepath = "C:\\Work\\position.log"
csvfile = open(filepath, 'r', newline='')
ofile = csv.reader(csvfile, delimiter=',')
for row in ofile:
    position_vectors.append(tuple([float(i) for i in row]))
csvfile.close()
This will get your points into Blender. Note the delimiter parameter in csv.reader, change that accordingly. With a real example file of your RC car we could provide a more complete solution.
For blender v2.62:
If you have a file "positions.log" looking like:
-8.691985196313894e-002; 4.119284642631801e-001; -5.832147659661263e-001
1.037146774956164e+000; 8.137243553005405e-002; -5.703274929662892e-001
-3.602584527944123e-001; 8.378614512537046e-001; 2.615265921163826e-001
6.266465707681335e-001; -1.128416901202341e+000; -1.664644365541639e+000
3.327523280880091e-001; 4.488553740582839e-001; -2.449449085462368e+000
-7.311567199869298e-001; -1.860587923723032e+000; -1.297179602213110e+000
-7.453603745688361e-003; 4.770473577895327e-001; -2.319515785100494e+000
1.935170866863264e-001; -2.010280476717868e+000; 3.748000986190077e-001
5.201529166915653e-001; 3.952972788761738e-001; 1.658581747430548e+000
4.719198263774027e-001; 1.526020825619557e+000; 3.187088567866725e-002
you can read it with this Python script in Blender (watch out for the indentation!):
import bpy
from mathutils import *
from math import *
from bpy.props import *
import os
import time
# Init
position_vector = []
# Open file
file = open("C:\\Work\\position.log", "r")
# Loop over lines in file
for line in file:
    # Split line at ";"
    splittet_line = line.split(";")
    # Append new position
    position_vector.append(
        Vector((float(splittet_line[0]),
                float(splittet_line[1]),
                float(splittet_line[2]))))

# Close file
file.close()

# Get first selected object
selected_object = bpy.context.selected_objects[0]

# Move the object to each position in turn
for position in position_vector:
    selected_object.location = position
This reads the file and updates the position of the first selected object accordingly. Way forward: What you have to find out is how to set the keyframes for the animation...
Consider this Python snippet to add to the solutions above:
obj = bpy.context.object
temporalScale = bpy.context.scene.render.fps
for lrt in locRotArray:
    obj.location = (lrt[0], lrt[1], lrt[2])
    # radians, and do you want XYZ, or ZYX?
    obj.rotation_euler = (lrt[3], lrt[4], lrt[5])
    time = lrt[6] * temporalScale
    obj.keyframe_insert(data_path="location", frame=time)
    obj.keyframe_insert(data_path="rotation_euler", frame=time)
I haven't tested it, but it will probably work, and gets you started.
With a spice2xyzv file as the input file, the script written by "Mutant Bob" seems to work.
But the XYZ velocity data are in km/s, not Euler angles, I think, and the import does not work for the angles.
# Records are <jd> <x> <y> <z> <vel x> <vel y> <vel z>
# Time is a TDB Julian date
# Position in km
# Velocity in km/sec
2456921.49775 213928288.518 -446198013.001 -55595492.9135 6.9011736 15.130842 0.54325805
Is there a solution to get them into Blender? Should I convert the velocities to Euler angles, and is that even possible?
I use this script:
import bpy
from mathutils import *
from math import *
from bpy.props import *
import os
import time
# Init
position_vector = []
# Open file
file = open("D:\\spice2xyzv\\export.xyzv", "r")
obj = bpy.context.object
temporalScale=bpy.context.scene.render.fps
for line in file:
    # Split line at " "
    print("line = %s" % line)
    line = line.replace("\n", "")
    locRotArray = line.split(" ")
    print("locRotArray = %s" % locRotArray)
    # for lrt in locRotArray:
    print(locRotArray[1])
    obj.location = (float(locRotArray[1]), float(locRotArray[2]), float(locRotArray[3]))
    # radians, and do you want XYZ, or ZYX?
    obj.rotation_euler = (float(locRotArray[4]), float(locRotArray[5]), float(locRotArray[5]))
    time = float(locRotArray[0]) * temporalScale
    print("time = %s" % time)
    obj.keyframe_insert(data_path="location", frame=time)
    obj.keyframe_insert(data_path="rotation_euler", frame=time)