I have multiple files with the following naming convention.
ENCSR000EQO_0_0.txt
ENCSR000DIA_0_0.txt
ENCSR000DIA_1_1.txt
ENCSR000DIA_2_1.txt
ENCSR000DIM_0_0.txt
ENCSR000DIM_1_1.txt
ENCSR000AIB_0_0.txt
ENCSR000AIB_1_1.txt
ENCSR000AIB_2_1.txt
ENCSR000AIB_3_1.txt
I want to merge them as DataFrames using pandas according to the file name prefix, so I would end up with 4 resulting DataFrames. Then, for each of these 4, I want to group by the gene (GeneName) column, since the same gene will appear multiple times.
They all have the same columns in the same order. I can merge all 10 together at once, but I couldn't figure out how to merge by name.
import os
import pandas as pd
import numpy as np

path = '/renamed/'
print(os.listdir(path))

df_merge = None
for fname in os.listdir(path):
    if fname.endswith('.txt'):
        df = pd.read_csv(path + fname, sep='\t', header=0)
        df.columns = ['ID ', 'Chr', 'Start', 'End', 'Strand', 'Peak Score', 'Focus Ratio/Region Size',
                      'Ann', 'DetAnn', 'Distance', 'PromoterID', 'EID',
                      'Unigene', 'Refseq', 'Ensembl', 'GeneName', 'GeneAlias',
                      'GeneDescription', 'GeneType']
        df = df.groupby('GeneName').agg(np.mean)
        print(df)
Thank you for any input.
I would do something more like this, where you can use glob to get the filenames, check each one, and then group the concatenated results.
import os
import glob
import pandas as pd
import numpy as np

path = 'renamed'
df_merge = None
for fid in ('EQO', 'DIA', 'DIM', 'AIB'):
    df_ = pd.DataFrame()
    for fname in glob.glob(os.path.join(path, '*.txt')):
        if fid in fname:
            df = pd.read_csv(fname, sep='\t', header=0)
            df.columns = ['ID ', 'Chr', 'Start', 'End', 'Strand', 'Peak Score', 'Focus Ratio/Region Size',
                          'Ann', 'DetAnn', 'Distance', 'PromoterID', 'EID',
                          'Unigene', 'Refseq', 'Ensembl', 'GeneName', 'GeneAlias',
                          'GeneDescription', 'GeneType']
            df_ = pd.concat((df_, df))
    df_ = df_.groupby('GeneName').agg(np.mean)
    print(df_)
Edit: expanding answer to be more automated.
Based on your filenames, you might be able to extract the IDs as follows:
import numpy as np
files = glob.glob(os.path.join(path, '*.txt'))
fids = np.unique([file.split('_')[0] for file in files])
Putting it all together the updated code would be this:
import os
import glob
import numpy as np
import pandas as pd

path = 'renamed'
files = glob.glob(os.path.join(path, '*.txt'))
fids = np.unique([file.split('_')[0] for file in files])

df_merge = None
for fid in fids:
    df_ = pd.DataFrame()
    for fname in files:
        if fid in fname:
            df = pd.read_csv(fname, sep='\t', header=0)
            df.columns = ['ID ', 'Chr', 'Start', 'End', 'Strand', 'Peak Score', 'Focus Ratio/Region Size',
                          'Ann', 'DetAnn', 'Distance', 'PromoterID', 'EID',
                          'Unigene', 'Refseq', 'Ensembl', 'GeneName', 'GeneAlias',
                          'GeneDescription', 'GeneType']
            df_ = pd.concat((df_, df))
    df_ = df_.groupby('GeneName').agg(np.mean)
    print(df_)
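If you would rather keep the four grouped DataFrames around instead of printing them, one option is to collect them in a dict keyed by the file ID. This is only a sketch of that idea, reusing the same column list and assuming the files live under the same path:

import os
import glob
import numpy as np
import pandas as pd

path = 'renamed'
files = glob.glob(os.path.join(path, '*.txt'))
fids = np.unique([os.path.basename(f).split('_')[0] for f in files])

# same column list as above
col_names = ['ID ', 'Chr', 'Start', 'End', 'Strand', 'Peak Score', 'Focus Ratio/Region Size',
             'Ann', 'DetAnn', 'Distance', 'PromoterID', 'EID',
             'Unigene', 'Refseq', 'Ensembl', 'GeneName', 'GeneAlias',
             'GeneDescription', 'GeneType']

grouped = {}
for fid in fids:
    frames = []
    for fname in files:
        if os.path.basename(fname).startswith(fid):
            df = pd.read_csv(fname, sep='\t', header=0)
            df.columns = col_names
            frames.append(df)
    # one merged, gene-averaged DataFrame per experiment ID
    grouped[fid] = pd.concat(frames).groupby('GeneName').agg(np.mean)

# e.g. grouped['ENCSR000DIA'] then holds the merged result for that experiment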
Try adding the file name prefix as a column, append all DataFrames to a list, concat them, then group:
import os
import pandas as pd
import numpy as np

# assumes `path` points at the folder with the .txt files, as in the question
df_merge = []
for fname in os.listdir(path):
    if fname.endswith('.txt'):
        df = pd.read_csv(path + fname, sep='\t', header=0)
        df.columns = ['ID ', 'Chr', 'Start', 'End', 'Strand', 'Peak Score', 'Focus Ratio/Region Size',
                      'Ann', 'DetAnn', 'Distance', 'PromoterID', 'EID',
                      'Unigene', 'Refseq', 'Ensembl', 'GeneName', 'GeneAlias',
                      'GeneDescription', 'GeneType']
        df['fname'] = [fname.split('_')[0] for x in df.index]  # repeat the file prefix for every row
        df_merge.append(df)

df_all = pd.concat(df_merge)
for fn in set(df_all['fname'].values):
    print(df_all[df_all['fname'] == fn].groupby('GeneName').agg(np.mean))
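Equivalently, once everything is concatenated with the prefix column, you can group on the prefix and the gene together in a single call; a small sketch assuming the df_all built above:

# Sketch: one result table for all prefixes at once (assumes df_all from above).
summary = df_all.groupby(['fname', 'GeneName']).agg(np.mean)
print(summary)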
# error creating table with desired column
from PySimpleGUI import PySimpleGUI as sg
import pandas as pd

# Layout
sg.theme('Reddit')
layout = [
    [sg.Text('Ticker: '), sg.Input(key='Ticker')],
    [sg.Text('Quantidade de Papéis: '), sg.Input(key='Qtd_de_papeis')],
    [sg.Text('Valor: '), sg.Input(key='Valor_pago')],
    [sg.Text('Data:'), sg.Input(key='Data')],
    [sg.Button('Adicionar'), sg.Button('Cancelar')]
]

# Window
window = sg.Window('Tela de Cadastro', layout)

# Read the events
while True:
    events, values = window.read()
    if events == (sg.WIN_CLOSED, 'Cancelar'):
        break
    if events == 'Adicionar':
        columns = list(values.keys())
        rows = list(values.values())
        print(rows)
        print(columns)
        df.to_csv('registro.csv', sep=';', mode='a', index=False)
        df_new = pd.read_csv('registro.csv', sep=';')

window.close()
print(df_new)

# I wanted to know if there is a way to assign columns, or will it be necessary to create variables?
Not sure if it works for you.
Here, it will create a new CSV file when the first record is added.
You can check whether the file exists or not.
If it exists, you can set headings = False before your event loop.
from PySimpleGUI import PySimpleGUI as sg
import pandas as pd

fields = {
    'Ticker'        : 'Ticker:',
    'Qtd_de_papeis' : 'Quantidade de Papéis:',
    'Valor_pago'    : 'Valor:',
    'Data'          : 'Data:'
}
columns = list(fields.keys())

sg.theme('Reddit')
layout = [
    [sg.Text(text), sg.Push(), sg.Input(key=key)] for key, text in fields.items()] + [
    [sg.Button(button) for button in ('Adicionar', 'Cancelar')]
]
window = sg.Window('Tela de Cadastro', layout)

headings = True
while True:
    events, values = window.read()
    if events in (sg.WIN_CLOSED, 'Cancelar'):
        break
    if events == 'Adicionar':
        df = pd.DataFrame({column: [] for column in columns})
        df.loc[0] = [values[key] for key in columns]
        if headings:
            df.to_csv('registro.csv', sep=';', index=False)
        else:
            df.to_csv('registro.csv', sep=';', mode='a', index=False, header=False)
        headings = False
        df_new = pd.read_csv('registro.csv', sep=';')
        print(df_new)

window.close()
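As noted above, rather than hard-coding headings = True, you can check whether the CSV already exists before the event loop. A minimal sketch, assuming the same 'registro.csv' file name:

import os

# Only write the header row if 'registro.csv' does not exist yet.
headings = not os.path.isfile('registro.csv')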
Great, it worked very well. Thanks for your attention.
I'm trying to merge/join x and y dataframes based on an exact match of the company columns and a partial match of some degree on the name columns.
Other than looking at the values returned by SequenceMatcher(None, x_name, y_name).ratio(), which were always above .8 in my case, I haven't tried much that warrants mentioning.
x = pd.DataFrame([{'id': 1, 'name': 'Robert Jackson', 'company': 'Test inc.', 'tenure': 6},
                  {'id': 2, 'name': 'William Johnson', 'company': 'Test inc.', 'tenure': 6}]).set_index('id')

y = pd.DataFrame([{'id': 4, 'name': 'Bob Jackson', 'company': 'Test inc.', 'job': 'desk'},
                  {'id': 5, 'name': 'Willy Johnson', 'company': 'Test inc.', 'job': 'desk'}]).set_index('id')

goal = pd.DataFrame([{'x_id': 1, 'y_id': 4, 'x_name': 'Robert Jackson', 'y_name': 'Bob Jackson', 'company': 'Test inc.', 'tenure': 6, 'job': 'desk'},
                     {'x_id': 2, 'y_id': 5, 'x_name': 'William Johnson', 'y_name': 'Willy Johnson', 'company': 'Test inc.', 'tenure': 6, 'job': 'desk'}])
Is something like this plausible? I'd appreciate any feedback, thank you.
Great question! I'm following to see other answers as I've been doing a lot of similar work lately. One inefficient method I've taken is to use fuzzywuzzy based on a threshold.
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

def fuzzy_merge(df_1, df_2, key1, key2, threshold=90, limit=1):
    # candidate strings to match against
    s = df_2[key2].tolist()
    # best fuzzy matches (with scores) for each value of df_1[key1]
    m = df_1[key1].apply(lambda x: process.extract(x, s, limit=limit))
    df_1['matches'] = m
    # keep only the matched strings whose score meets the threshold
    m2 = df_1['matches'].apply(lambda x: ', '.join([i[0] for i in x if i[1] >= threshold]))
    df_1['matches'] = m2
    return df_1
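As a rough usage sketch on the x and y frames from the question (the threshold value here is just an assumption to tune, and fuzzywuzzy's default scorer is used):

# Sketch: record the best fuzzy match for each name, then merge on the matched string.
matched = fuzzy_merge(x.reset_index(), y.reset_index(), 'name', 'name', threshold=80)
result = matched.merge(y.reset_index(), left_on='matches', right_on='name',
                       how='left', suffixes=('_x', '_y'))
print(result)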
The solution that I used was:
from difflib import SequenceMatcher

x['merge_name'] = x['name']
x['merge_comp'] = x['company']

for a, b in x[['name', 'company']].values:
    for ixb, (c, d) in enumerate(y[['name', 'company']].values):
        if SequenceMatcher(None, a, c).ratio() >= .8:
            y.loc[y.index[ixb], 'merge_name'] = a   # use the label at position ixb, since 'id' is y's index
        if SequenceMatcher(None, b, d).ratio() == 1:
            y.loc[y.index[ixb], 'merge_comp'] = b

goal = pd.merge(x, y, on=['merge_name', 'merge_comp'])
This function worked while passing an arbitrary number of columns:
def sm_merge(df1, df2, columns=[], ratios=[], prefix='m_', reset_index=False, post_drop=True):
    if reset_index:
        df1.reset_index(inplace=True)
        df2.reset_index(inplace=True)
    flag = 0
    merge_columns = []
    r = len(columns)
    for f in range(r):
        df1[prefix + columns[flag]] = df1[columns[flag]]
        merge_columns.append(prefix + columns[flag])
        flag += 1
    flag = 0
    for f in range(r):
        for col_1 in df1[columns[flag]].values:
            for index, col_2 in enumerate(df2[columns[flag]].values):
                if SequenceMatcher(None, str(col_1), str(col_2)).ratio() >= ratios[flag]:
                    df2.loc[index, merge_columns[flag]] = col_1
        flag += 1
    df = pd.merge(df1, df2, on=merge_columns)
    if post_drop:
        df1.drop(columns=merge_columns, inplace=True)
        df2.drop(columns=merge_columns, inplace=True)
    return df

sm_merge(x, y, columns=['name', 'company'], ratios=[.8, 1], reset_index=True)
This function worked for passing exactly 2 columns/ratios:
def sm_merge(df1, df2, columns=[], ratios=[], prefix='m_', reset_index=True, post_drop=True):
    df1_c = df1.copy()
    df2_c = df2.copy()
    if reset_index:
        df1_c.reset_index(inplace=True)
        df2_c.reset_index(inplace=True)
    df1_c[prefix + columns[0]] = df1_c[columns[0]]
    df1_c[prefix + columns[1]] = df1_c[columns[1]]
    merge_columns = [prefix + columns[0], prefix + columns[1]]
    for col_1, col_2 in df1_c[[columns[0], columns[1]]].values:
        for index, (col_3, col_4) in enumerate(df2_c[[columns[0], columns[1]]].values):
            if SequenceMatcher(None, str(col_1), str(col_3)).ratio() >= ratios[0]:
                df2_c.loc[index, merge_columns[0]] = col_1
            if SequenceMatcher(None, str(col_2), str(col_4)).ratio() >= ratios[1]:
                df2_c.loc[index, merge_columns[1]] = col_2
    df = pd.merge(df1_c, df2_c, on=merge_columns)
    if post_drop:
        df.drop(columns=merge_columns, inplace=True)
    return df

sm_merge(x, y, columns=['name', 'company'], ratios=[.8, 1])
I am trying to group by Enterprise ID for the following specializations, but I am not getting the expected result (or any, for that matter). The data stays ungrouped even after this step. Any idea what's wrong in my code?
cols_specials = ['Enterprise ID','Specialization','Specialization Branches','Specialization Type']
specials = pd.read_csv(agg_specials, engine='python')
specials = specials.merge(roster, left_on='Enterprise ID', right_on='Enterprise ID', how='left')
specials = specials[cols_specials]
specials = specials.groupby(['Enterprise ID'])['Specialization'].transform(lambda x: '; '.join(str(x)))
specials.to_csv(end_report_specials, index=False, encoding='utf-8-sig')
Please try using agg:
import pandas as pd

df = pd.DataFrame(
    [
        ['john', 'eng', 'build'],
        ['john', 'math', 'build'],
        ['kevin', 'math', 'asp'],
        ['nick', 'sci', 'spi']
    ],
    columns=['id', 'spec', 'type']
)
df.groupby(['id'])[['spec']].agg(lambda x: ';'.join(x))
results in:
           spec
id
john   eng;math
kevin      math
nick        sci
If you need to preserve the starting number of rows, use transform. transform returns one column:
df['spec_grouped'] = df.groupby(['id'])[['spec']].transform(lambda x: ';'.join(x))
df
results in:
      id  spec   type spec_grouped
0   john   eng  build     eng;math
1   john  math  build     eng;math
2  kevin  math    asp         math
3   nick   sci    spi          sci
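Applied back to the code in the question, the offending line would become something like this (a sketch, using the column names from the question and assuming Specialization holds strings):

# agg -> one row per Enterprise ID; transform -> keeps the original row count.
# Note: join the values themselves, not str(x); str(x) would join individual characters.
specials_agg = (specials.groupby('Enterprise ID')['Specialization']
                .agg(lambda x: '; '.join(x))
                .reset_index())
specials['Specialization'] = (specials.groupby('Enterprise ID')['Specialization']
                              .transform(lambda x: '; '.join(x)))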
I have a dataframe with column names, and I want to find the one that contains a certain string, but does not exactly match it. I'm searching for 'spike' in column names like 'spike-2', 'hey spike', 'spiked-in' (the 'spike' part is always continuous).
I want the column name to be returned as a string or a variable, so I access the column later with df['name'] or df[name] as normal. I've tried to find ways to do this, to no avail. Any tips?
Just iterate over DataFrame.columns; here is an example in which you end up with a list of the column names that match:
import pandas as pd
data = {'spike-2': [1,2,3], 'hey spke': [4,5,6], 'spiked-in': [7,8,9], 'no': [10,11,12]}
df = pd.DataFrame(data)
spike_cols = [col for col in df.columns if 'spike' in col]
print(list(df.columns))
print(spike_cols)
Output:
['hey spke', 'no', 'spike-2', 'spiked-in']
['spike-2', 'spiked-in']
Explanation:
df.columns returns the column names (an Index that you can iterate over like a list)
[col for col in df.columns if 'spike' in col] iterates over df.columns with the variable col and adds col to the resulting list if it contains 'spike'. This syntax is a list comprehension.
If you only want the resulting data set with the columns that match you can do this:
df2 = df.filter(regex='spike')
print(df2)
Output:
spike-2 spiked-in
0 1 7
1 2 8
2 3 9
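To get back to the goal in the question of accessing the matching column later by name, you can take an element of that list; a tiny sketch assuming exactly one match is expected:

# Sketch: grab the first matching column name and use it like any other label.
name = spike_cols[0]   # e.g. 'spike-2'
print(df[name])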
This answer uses the DataFrame.filter method to do this without list comprehension:
import pandas as pd
data = {'spike-2': [1,2,3], 'hey spke': [4,5,6]}
df = pd.DataFrame(data)
print(df.filter(like='spike').columns)
Will output just 'spike-2'. You can also use regex, as some people suggested in comments above:
print(df.filter(regex='spike|spke').columns)
Will output both columns: ['spike-2', 'hey spke']
You can also use df.columns[df.columns.str.contains(pat = 'spike')]
data = {'spike-2': [1,2,3], 'hey spke': [4,5,6], 'spiked-in': [7,8,9], 'no': [10,11,12]}
df = pd.DataFrame(data)
colNames = df.columns[df.columns.str.contains(pat = 'spike')]
print(colNames)
This will output the column names: 'spike-2', 'spiked-in'
More about pandas.Series.str.contains.
# select columns containing 'spike'
df.filter(like='spike', axis=1)
You can also select by exact name or by regular expression; refer to pandas.DataFrame.filter.
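For instance, a short sketch of those other filter selectors (exact names via items, patterns via regex):

# Sketch: the other DataFrame.filter selectors.
df.filter(items=['spike-2'])          # select by exact column name
df.filter(regex='^spike', axis=1)     # select by regular expression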
df.loc[:,df.columns.str.contains("spike")]
Another solution that returns a subset of the df with the desired columns:
df[df.columns[df.columns.str.contains("spike|spke")]]
You also can use this code:
spike_cols = [x for x in df.columns[df.columns.str.contains('spike')]]
Getting name and subsetting based on Start, Contains, and Ends:
# from: https://stackoverflow.com/questions/21285380/find-column-whose-name-contains-a-specific-string
# from: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.contains.html
# from: https://cmdlinetips.com/2019/04/how-to-select-columns-using-prefix-suffix-of-column-names-in-pandas/
# from: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.filter.html
import pandas as pd
data = {'spike_starts': [1,2,3], 'ends_spike_starts': [4,5,6], 'ends_spike': [7,8,9], 'not': [10,11,12]}
df = pd.DataFrame(data)
print("\n")
print("----------------------------------------")
colNames_contains = df.columns[df.columns.str.contains(pat = 'spike')].tolist()
print("Contains")
print(colNames_contains)
print("\n")
print("----------------------------------------")
colNames_starts = df.columns[df.columns.str.contains(pat = '^spike')].tolist()
print("Starts")
print(colNames_starts)
print("\n")
print("----------------------------------------")
colNames_ends = df.columns[df.columns.str.contains(pat = 'spike$')].tolist()
print("Ends")
print(colNames_ends)
print("\n")
print("----------------------------------------")
df_subset_start = df.filter(regex='^spike',axis=1)
print("Starts")
print(df_subset_start)
print("\n")
print("----------------------------------------")
df_subset_contains = df.filter(regex='spike',axis=1)
print("Contains")
print(df_subset_contains)
print("\n")
print("----------------------------------------")
df_subset_ends = df.filter(regex='spike$',axis=1)
print("Ends")
print(df_subset_ends)
I normally use h5py to do the HDF5 stuff in Python, and if I want to create a dataset which I want to extend later, I do:
f = h5py.File('foo.h5', 'w')
d = f.create_dataset('whatever', (5, 5), maxshape=(None, 5), dtype='i8', chunks=True)
...
d.resize((23, 5))
...
The maxshape=(None, ...) sets the first dimension to "infinity", so it's extensible.
Now I have a project where I need to stick with PyTables and wanted to build up large arrays step by step. Is there a way to extend arrays in PyTables?
This is roughly the idea:
import tables as tb
import numpy as np
filename = "foo.h5"
h5file = tb.File(filename, "a")
gbar = h5file.create_group(h5file.root, "bar", "Pressure")
h5file.create_array(gbar, 'left', np.array((1, 2, 3, 4)), "...")
# now extend the shape of (4,) and append more arrays iteratively???
h5file.close()
I found the solution in the docs: tables.EArray
http://www.pytables.org/usersguide/libref/homogenous_storage.html#earrayclassdescr
Here is a descriptive example which adds two "columns" using two different ways of defining the dtype. The with block can be run multiple times and it will keep extending the columns:
import tables as tb
import numpy as np

filename = 'foo.h5'

with tb.File(filename, "a") as f:
    if "/foo" not in f:
        group = f.create_group("/", 'foo', 'Foo Information')
    else:
        group = f.root.foo

    if "col1" not in group:
        a = tb.Atom.from_dtype(np.dtype('<f8'), dflt=0.0)
        arr = f.create_earray(group, 'col1', a, (0,), "Bar")
    else:
        arr = getattr(group, "col1")
    arr.append(np.arange(10))
    arr.append(np.arange(40, 45))

    if "col2" not in group:
        b = tb.Int64Atom()
        arr = f.create_earray(group, 'col2', b, (0,), "Baz")
    else:
        arr = getattr(group, "col2")
    arr.append(np.arange(7))
    arr.append(np.arange(30, 38))
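To check that the columns really do grow across runs, you can re-open the file and look at the shapes; a short sketch assuming the foo.h5 file created above:

import tables as tb

# Sketch: re-open foo.h5 and inspect the extendable columns created above.
with tb.open_file('foo.h5', 'r') as f:
    col1 = f.root.foo.col1
    col2 = f.root.foo.col2
    print(col1.shape, col2.shape)   # the first dimension grows with every append
    print(col1[:5])                 # EArrays slice like NumPy arrays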