proximity search on different rows - pandas

For data that indexed from dataframe like this:
import json
mycolumns = ['name']
df = pd.DataFrame(columns=mycolumns)
rows = [["John Abraham"],["Lincoln Smith"]]
for row in rows:
df.loc[len(df)] = row
print(df)
jsons = json.loads(df.to_json(orient='records'))
n = 0
for j in jsons:
j['injection_timestamp'] = pd.to_datetime('now')
es.index(index="prox", doc_type='record', body=j)
if n%1000==0:
print (n/1000),
n+=1
I am trying to search match_phrase that is spread on two rows as described here:
https://www.elastic.co/guide/en/elasticsearch/guide/current/_multivalue_fields_2.html#_multivalue_fields_2
es.search(index="prox", body={"query": {"match_phrase":{"name": "Abraham Lincoln"}}})
I expected to get 1 hit because of the ways how arrays are indexed but I don't get any hit.

Related

Creating pandas columns with for loop

I have the following dataframe created through the following chunk of code:
df = pd.DataFrame(
[
(13412339, '07/03/2022', '08/03/2022', '10/03/2022', 1),
(13412343, '07/03/2022', '07/03/2022', '09/03/2022', 0),
(13412489, '07/02/2022', '08/02/2022', '07/03/2022', 0),
],
columns=['task_id', 'start_date', 'end_date', 'end_period', 'status']
)
df = df.astype(dtype={'status' : bool})
df.start_date = pd.to_datetime(df.start_date)
df.end_date = pd.to_datetime(df.end_date)
df.end_period = pd.to_datetime(df.end_period)
What I need to do here is to calculate the difference in days between the start_date and end_date columns if the status column is False, else it should do the same but between start_date and end_period columns.
The code that I have implemented to calculate the days differences between the start_date and end_date columns is as follows:
new_frame = pd.DataFrame()
for row in range(df.shape[0]):
#extract the row
extracted_row = df.loc[row,:]
#Calculates the date difference in days for each row in the loop
diff = extracted_row['end_date'] - extracted_row['start_date']
diff_days = diff.days
#Iterate over these date differences and repeat the row for each full day
for i in range(diff_days+1):
new_row = extracted_row.copy()
new_row['date'] = new_row['start_date'] + dt.timedelta(days=i)
new_row = new_row[['task_id','start_date','end_date',
'end_period','date','status']]
#appends the rows created to new dataframe
new_frame = new_frame.append(new_row)
#Rearranges columns in the desired order
new_frame = new_frame[['task_id','start_date','end_date','end_period','date','status']]
#Changes data types
new_frame = new_frame.astype(dtype={'task_id' : int,'status' : bool})
Then in order to calculate the differences if the status column is False, I did the following one:
new_frame1 = pd.DataFrame()
new_frame2 = pd.DataFrame()
for row in range(df.shape[0]):
#In this iteration, status column should be equals True
if df['status'] == False:
#extract the row
extracted_row_end = df.loc[row,:]
#Calculates the date difference in days for each row in the loop
diff1 = extracted_row_end['end_date'] - extracted_row_end['start_date']
diff_days_end = diff1.days
#Iterate over these date differences and repeat the row for each full day
for i in range(diff_days_end+1):
new_row_end = extracted_row_end.copy()
new_row_end['date'] = new_row_end['start_date'] + dt.timedelta(days=i)
new_row_end = new_row_end[['task_id','start_date','end_date',
'end_period','date','status']]
#appends the rows created to new dataframe
new_frame1 = new_frame1.append(new_row_end)
#Rearranges columns in the desired order
new_frame = new_frame[['task_id','start_date','end_date','end_period','date','status']]
#Changes data types
new_frame = new_frame.astype(dtype={'task_id' : int,'status' : bool})
#In this iteration, status column should be equals False
else:
#extract the row
extracted_row_period = df.loc[row,:]
#Calculates the date difference in days for each row in the loop
diff2 = extracted_row_period['end_period'] - extracted_row_period['start_date']
diff_days_period = diff2.days
#Iterate over these date differences and repeat the row for each full day
for i in range(diff_days_period+1):
new_row_period = extracted_row_end.copy()
new_row_period['date'] = new_row_period['start_date'] + dt.timedelta(days=i)
new_row_period = new_row_period[['task_id','start_date','end_date',
'end_period','date','status']]
#appends the rows created to new dataframe
new_frame2 = new_frame2.append(new_row_period)
#Rearranges columns in the desired order
new_frame = new_frame[['task_id','start_date','end_date','end_period','date','status']]
#Changes data types
new_frame = new_frame.astype(dtype={'task_id' : int,'status' : bool})
#Merges both dataframes
frames = [new_frame1,new_frame2]
df = pd.concat(frames)
Then it throws an error when starts the first for loop, here is where I should be asking help on how to calculate the difference in days between the start_date and end_date columns if the status column is False, else calculate it between start_date and end_period columns.
The complete error is as follows:
Some part of your code did not work on my machine (so I just took the initial df from your first cell) - but when reading what you need, this is what I would do
import numpy as np
df['dayDiff']=np.where(df['status'],(df['end_period']-df['start_date']).dt.days,(df['end_date']-df['start_date']).dt.days)
df
As you already have booleand on df['status'], I would use that to the np.where condition , then either calculate the day difference df['end_period']-df['start_date']).dt.days when True either day difference (df['end_date']-df['start_date']).dt.days when False

to_string(index = False) results in non empty string even when dataframe is empty

I am doing the following in my python script and I want to hide the index column when I print the dataframe. So I used .to_string(index = False) and then use len() to see if its zero or not. However, when i do to_string(), if the dataframe is empty the len() doesn't return zero. If i print the procinject1 it says "Empty DataFrame". Any help to fix this would be greatly appreciated.
procinject1=dfmalfind[dfmalfind["Hexdump"].str.contains("MZ") == True].to_string(index = False)
if len(procinject1) == 0:
print(Fore.GREEN + "[✓]No MZ header detected in malfind preview output")
else:
print(Fore.RED + "[!]MZ header detected within malfind preview (Process Injection indicator)")
print(procinject1)
That's the expected behaviour in Pandas DataFrame.
In your case, procinject1 stores the string representation of the dataframe, which is non-empty even if the corresponding dataframe is empty.
For example, check the below code snippet, where I create an empty dataframe df and check it's string representation:
df = pd.DataFrame()
print(df.to_string(index = False))
print(df.to_string(index = True))
For both index = False and index = True cases, the output will be the same, which is given below (and that is the expected behaviour). So your corresponding len() will always return non-zero.
Empty DataFrame
Columns: []
Index: []
But if you use a non-empty dataframe, then the outputs for index = False and index = True cases will be different as given below:
data = [{'A': 10, 'B': 20, 'C':30}, {'A':5, 'B': 10, 'C': 15}]
df = pd.DataFrame(data)
print(df.to_string(index = False))
print(df.to_string(index = True))
Then the outputs for index = False and index = True cases respectively will be -
A B C
10 20 30
5 10 15
A B C
0 10 20 30
1 5 10 15
Since pandas handles empty dataframes differently, to solve your problem, you should first check whether your dataframe is empty or not, using pandas.DataFrame.empty.
Then if the dataframe is actually non-empty, you could print the string representation of that dataframe, while keeping index = False to hide the index column.

Arrays of DataFrames

How can I create an array of DataFrames?
This is cc01, the remaining ccXXs look the same:
And, this is my try:
i = 0
for ccXX in [cc01, cc02, cc03, cc04, cc05]:
i += 1
deltas[i] = ccXX.iloc[[10, 20],[-2, -1]]

Dataframe index rows all 0's

I'm iterating through PDF's to obtain the text entered in the form fields. When I send the rows to a csv file it only exports the last row. When I print results from the Dataframe, all the row indexes are 0's. I have tried various solutions from stackoverflow, but I can't get anything to work, what should be 0, 1, 2, 3...etc. are coming in as 0, 0, 0, 0...etc.
Here is what I get when printing results, only the last row exports to csv file:
0
0 1938282828
0
0 1938282828
0
0 22222222
infile = glob.glob('./*.pdf')
for i in infile:
if i.endswith('.pdf'):
pdreader = PdfFileReader(open(i,'rb'))
diction = pdreader.getFormTextFields()
myfieldvalue2 = str(diction['ID'])
df = pd.DataFrame([myfieldvalue2])
print(df)`
Thank you for any help!
You are replacing the same dataframe each time:
infile = glob.glob('./*.pdf')
for i in infile:
if i.endswith('.pdf'):
pdreader = PdfFileReader(open(i,'rb'))
diction = pdreader.getFormTextFields()
myfieldvalue2 = str(diction['ID'])
df = pd.DataFrame([myfieldvalue2]) # this creates new df each time
print(df)
Correct Code:
infile = glob.glob('./*.pdf')
df = pd.DataFrame()
for i in infile:
if i.endswith('.pdf'):
pdreader = PdfFileReader(open(i,'rb'))
diction = pdreader.getFormTextFields()
myfieldvalue2 = str(diction['ID'])
df = df.append([myfieldvalue2])
print(df)

vectorization of loop in pandas

I've been trying to vectorize the following with no such luck:
Consider two data frames. One is a list of dates:
cols = ['col1', 'col2']
index = pd.date_range('1/1/15','8/31/18')
df = pd.DataFrame(columns = cols )
What i'm doing currently is looping thru df and getting the counts of all rows that are less than or equal to the date in question with my main (large) dataframe df_main
for x in range(len(index)):
temp_arr = []
active = len(df_main[(df_main.n_date <= index[x])]
temp_arr = [index[x],active]
df= df.append(pd.Series(temp_arr,index=cols) ,ignore_index=True)
Is there a way to vectorize the above?
What about something like the following
#initializing
mycols = ['col1', 'col2']
myindex = pd.date_range('1/1/15','8/31/18')
mydf = pd.DataFrame(columns = mycols )
#create df_main (that has each of myindex's dates minus 10 days)
df_main = pd.DataFrame(data=myindex-pd.Timedelta(days=10), columns=['n_date'])
#wrap a dataframe around a list comprehension
mydf = pd.DataFrame([[x, len(df_main[df_main['n_date'] <= x])] for x in myindex])