Can't create X_train and X_test DataFrames (from 2 different CSV files) and can't use them as integers
data = pd.read_csv('action_train.csv', delimiter=';', header=0)
data = (data.replace(to_replace='[act1_]', value='', regex=True)
            .replace(to_replace='[act2_]', value='', regex=True)
            .replace(to_replace='[type ]', value='', regex=True))
print(data.shape)
print(list(data.columns))
data1 = pd.read_csv('action_test.csv', delimiter=';', header=0)
data1 = (data1.replace(to_replace='[act1_]', value='', regex=True)
              .replace(to_replace='[act2_]', value='', regex=True)
              .replace(to_replace='[type ]', value='', regex=True))
print(data1.shape)
print(list(data1.columns))
X_train=data['action_id', 'char_1', 'char_2', 'char_3', 'char_4', 'char_5', 'char_6', 'char_7', 'char_8', 'char_9', 'char_10']
print(X_train)
y_train=data['result']
X_test=data1['action_id', 'char_1', 'char_2', 'char_3', 'char_4', 'char_5', 'char_6', 'char_7', 'char_8', 'char_9', 'char_10']
print(X_test)
y_test=data1['result']
I tried to use them in different ways but got a tuple instead of an array. I also can't convert the object dtype to integer.
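One likely cause, going only by the snippet above: selecting several columns with single brackets passes a tuple of labels to data[...], which is not valid column selection; a list of column names is needed. A minimal sketch of that fix plus the integer conversion, assuming the cleaned columns contain only digit strings:

feature_cols = ['action_id', 'char_1', 'char_2', 'char_3', 'char_4', 'char_5',
                'char_6', 'char_7', 'char_8', 'char_9', 'char_10']

# Double brackets (a list of labels) return a DataFrame instead of
# treating the labels as a single tuple key.
X_train = data[feature_cols]
X_test = data1[feature_cols]

# Convert the object-dtype columns to integers; errors='coerce' turns
# anything non-numeric into NaN so the failing values are easy to spot.
X_train = X_train.apply(pd.to_numeric, errors='coerce').astype('Int64')
X_test = X_test.apply(pd.to_numeric, errors='coerce').astype('Int64')

y_train = data['result']
y_test = data1['result']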
I have thousands of json.gz files, each with a variety of information about scientific papers. For each file, I have to extract the relevant information - e.g. title and labels - to make a dataset, then transform it to a tf.dataset. However, this is inefficient, since I cannot filter the subjects directly or shuffle them in a single step.
I would like to read them using tf.dataset.interleave in order to shuffle them, but also to filter them according to specific labels.
Here is how I'm doing it so far.
import gzip
import json
import datetime

import tensorflow as tf
import pandas as pd

# Relevant feature extraction ("themes" is a dict defined elsewhere that maps
# subject names to integer labels)
def load_file(file):
    # with gzip.open(bytes.decode(file), 'r') as fin:  # "file" arrives as bytes from tf
    with gzip.open(file, 'r') as fin:
        json_bytes = fin.read()
    json_str = json_bytes.decode('utf-8')
    bb = json.loads(json_str)
    bb = pd.json_normalize(bb, 'items',
                           ['indexed', ['title', 'publisher', 'type', 'indexed.date-parts', 'subject']],
                           errors='ignore')
    bb.dropna(subset=['title', 'publisher', 'type', 'indexed.date-parts', 'subject'], inplace=True)
    bb.subject = bb.subject.apply(
        lambda x: int(themes[list(set(x) & set(themes.keys()))[0]])
        if len(set(x) & set(themes.keys())) > 0
        else len(themes.keys()) + 1)
    bb.title = bb.title.str.join('').values
    # bb['author'] = bb['author'].apply(lambda x: '; '.join([', '.join([i['given'], i['family']]) for i in x]))
    bb['indexed.date-parts'] = bb['indexed.date-parts'].apply(
        lambda tpl: datetime.datetime.strptime('-'.join(str(x) for x in tpl[0]), '%Y-%m-%d').strftime('%Y-%m-%d'))
    # bb = bb.sample(n=32, replace=True)
    # return bb.title.str.join('').values, bb.subject.str.join(', ').values
    return dict(bb[['title', 'publisher', 'type', 'indexed.date-parts', 'subject']])
file_list = ['file_2021_01/10625.json.gz',
             'file_2021_01/23897.json.gz',
             'file_2021_01/12169.json.gz',
             'file_2021_01/427.json.gz', ...]

filenames = tf.data.Dataset.list_files(file_list, shuffle=True)
dataset = filenames.apply(
    tf.data.experimental.parallel_interleave(
        lambda x: tf.data.Dataset.from_tensor_slices(
            tf.numpy_function(load_file, [x], (tf.int64))),
        cycle_length=1))
However, it results in an error:
InternalError: Unsupported object type dict
[[{{node PyFunc}}]] [Op:IteratorGetNext]
Thanks
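The error comes from the fact that tf.numpy_function cannot return a Python dict; its Tout argument has to describe a flat tuple of tensors. A minimal sketch of one way around this (load_file_tuple, the set_shape calls, and the placeholder label filter are my own additions for illustration, not part of the original code): return the columns as separate NumPy arrays, rebuild a per-file dataset with from_tensor_slices, and use the non-deprecated Dataset.interleave, after which filtering and shuffling can be chained on the flat record stream.

def load_file_tuple(file):
    # Reuse the cleaning from load_file, but return plain arrays instead of a dict.
    bb = pd.DataFrame(load_file(file))
    titles = bb['title'].str.encode('utf-8').to_numpy()   # byte strings for tf.string
    subjects = bb['subject'].to_numpy('int64')             # integer labels for tf.int64
    return titles, subjects

def file_to_dataset(path):
    titles, subjects = tf.numpy_function(
        load_file_tuple, [path], Tout=(tf.string, tf.int64))
    # Shapes are unknown after numpy_function; declare them as 1-D vectors.
    titles.set_shape([None])
    subjects.set_shape([None])
    return tf.data.Dataset.from_tensor_slices((titles, subjects))

filenames = tf.data.Dataset.list_files(file_list, shuffle=True)
dataset = (filenames
           .interleave(file_to_dataset,
                       cycle_length=4,
                       num_parallel_calls=tf.data.AUTOTUNE)
           .filter(lambda title, subject: tf.not_equal(subject, 5))  # placeholder label
           .shuffle(1000))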
I am trying to update the tokens produced by a pre-trained model using the retokenizer. I created a pipeline component to do this. In this component, I also set "ENT_TYPE" when merging the tokens.
import re

import spacy
import mojimoji
from spacy.language import Language

@Language.factory("re_tokenize")
def re_tokenize(nlp, name):
    return ReTokenize(nlp.vocab)

class ReTokenize:
    pattern = ""

    def __init__(self, vocab):
        self.pattern = r"[a-zA-Z0-9]+\[{0,1}[a-zA-Z0-9_]+\]{0,1}\[{0,1}[a-zA-Z0-9_]+\]{0,1}\[{0,1}[a-zA-Z0-9_]+\]{0,1}#{0,1}"

    def __call__(self, doc):
        spans = []
        for match in re.finditer(self.pattern, doc.text):
            start, end = match.span()
            span = doc.char_span(start, end)
            if span is not None:
                spans.append(span)
        with doc.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span, attrs={"ENT_TYPE": "VAR"})
        return doc
Using this pipeline, I can tokenize the words correctly. Also, the data in ent_type_ seems to be updated.
BEFORE:
# Set model
nlp = spacy.load("ja_ginza")
text = "aaa_bbbとaaa_CCCの2バイトマップ"
text = mojimoji.zen_to_han(text).lower()
doc = nlp(text)
print([token.text for token in doc])
print([token.ent_type_ for token in doc])
['aaa', '', 'bbb', 'と', 'aaa', '', 'ccc', 'の', '2', 'バイト', 'マップ']
['Product_Other', 'Product_Other', 'Product_Other', '', 'Product_Other', 'Product_Other', 'Product_Other', '', 'N_Product', 'N_Product', 'N_Product']
AFTER:
nlp.add_pipe("re_tokenize", before="parser")
doc = nlp(text)
print([token.text for token in doc])
print([token.ent_type_ for token in doc])
['aaa_bbb', 'と', 'aaa_ccc', 'の', '2', 'バイト', 'マップ']
['VAR', '', 'VAR', '', 'N_Product', 'N_Product', 'N_Product']
However, it seems that doc.ents is not being updated:
print([ent.label_ for ent in doc.ents])
['N_Product']
How do I also update doc.ents?
To add a single new entity to a doc without modifying any other entity annotation, use doc.set_ents():
span = doc.char_span(start, end, label="VAR")
doc.set_ents(entities=[span], default="unmodified")
More docs: https://spacy.io/api/doc#set_ents
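Applied to the component from the question, that means re-creating the matched spans with a label after merging and registering them on the doc. Below is a minimal sketch of an adjusted __call__ along those lines (one possible arrangement, not the only way to wire it up):

    def __call__(self, doc):
        # Remember character offsets; token-based spans go stale once the
        # retokenizer changes token boundaries.
        offsets = [m.span() for m in re.finditer(self.pattern, doc.text)]

        spans = [doc.char_span(start, end) for start, end in offsets]
        spans = [s for s in spans if s is not None]
        with doc.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span, attrs={"ENT_TYPE": "VAR"})

        # Rebuild the spans on the merged doc with a label and add them to
        # doc.ents, leaving all other entity annotation untouched.
        new_ents = [doc.char_span(start, end, label="VAR") for start, end in offsets]
        doc.set_ents(entities=[s for s in new_ents if s is not None], default="unmodified")
        return doc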
I wrote the code below. I wanted to change every column value that contains TB or GB into a single integer; for example, if a column has 2 TB, this code deletes the TB and keeps it as 2. The program works fine. What I want to do now is convert 2 TB into 2048 GB so that I can sum all the column values. Is there a way to remove the TB suffix and perform the calculation on that specific row at the same time?
import pandas as pd

def removeend():
    df = pd.read_csv('ExportList.csv')
    if df["Used Space"].str.contains("GB | TB").any() or df["Memory Size"].str.contains("GB | TB").any() or df["Host CPU"].str.contains("Hz|MHz|GHz").any():
        df['Used Space'] = df['Used Space'].str.replace(r'GB|TB', '', regex=True)
        df["Memory Size"] = df["Memory Size"].str.replace(r'GB|TB', '', regex=True)
        df['Host CPU'] = df['Host CPU'].str.replace(r'MHz|Hz|GHz', '', regex=True)
        df = df.convert_dtypes()
        df["Used Space"] = pd.to_numeric(df["Used Space"])
        df["Memory Size"] = pd.to_numeric(df["Memory Size"])
        df["Host CPU"] = pd.to_numeric(df["Host CPU"])
    else:
        print("Error occurred!!!")
    return df
Define/create a custom function:
def converter(x):
    try:
        return pd.eval(x)
    except Exception:
        return x
Finally:
cols=["Used Space","Memory Size"]
df[cols]=df[cols].replace({'GB':'','TB':'*1024'},regex=True).applymap(converter)
df["Host CPU"]=df["Host CPU"].replace({'MHz':'','GHz':'*0.001','Hz':'*0.000001'},regex=True).map(converter)
This question already has an answer here:
How to use numpy.savetxt with a structured array that contains an array
(1 answer)
Closed 1 year ago.
import numpy as np
row_a = ['0.01722497', '', '0.09496404', '0.03654174', '0.03624997', '0.01583785', '0.02002064', '0.13934049', '0.0405615', '0.05686177', '', '0.08495372', '0.00619173', '0.00515492', '0.01053369', '0.06576333']
row_b = [0.04871661, 0.1122536, 0.20836956, 0.05473605, 0.02344445, 0.01739371, 0.00524003, 0.0640286, 0.02766152, 0.02442267, 0.04183814, 0.04853815, 0.01682549, 0.00263045, 0.00819199, 0.1631007]
dt = np.dtype([('col_1', 'U32'), ('col_2', float)])
arr = np.empty((2, len(row_a)), dtype=dt)
arr['col_1'] = row_a
arr['col_2'] = row_b
np.savetxt('table.csv', arr, delimiter=',', header='col_1,col_2', fmt='%s %f')
The code above (which is supposed to create a structured array out of a string array and a float array and write it to a CSV) gives me the following error, even though I have two arrays of the same length, two columns, and two formats:
ValueError: fmt has wrong number of % formats: %s %f
Making a 1d structured array (as per my comment):
In [423]: row_a = ['0.01722497', '', '0.09496404', '0.03654174', '0.03624997', '0.01583785',
     ...:          '0.02002064', '0.13934049', '0.0405615', '0.05686177', '', '0.08495372',
     ...:          '0.00619173', '0.00515492', '0.01053369', '0.06576333']
     ...: row_b = [0.04871661, 0.1122536, 0.20836956, 0.05473605, 0.02344445, 0.01739371,
     ...:          0.00524003, 0.0640286, 0.02766152, 0.02442267, 0.04183814, 0.04853815,
     ...:          0.01682549, 0.00263045, 0.00819199, 0.1631007]
     ...: dt = np.dtype([('col_1', 'U32'), ('col_2', float)])
     ...: arr = np.empty(len(row_a), dtype=dt)
     ...: arr['col_1'] = row_a
     ...: arr['col_2'] = row_b
In [424]: arr
Out[424]:
array([('0.01722497', 0.04871661), ('', 0.1122536 ),
       ('0.09496404', 0.20836956), ('0.03654174', 0.05473605),
       ('0.03624997', 0.02344445), ('0.01583785', 0.01739371),
       ('0.02002064', 0.00524003), ('0.13934049', 0.0640286 ),
       ('0.0405615', 0.02766152), ('0.05686177', 0.02442267),
       ('', 0.04183814), ('0.08495372', 0.04853815),
       ('0.00619173', 0.01682549), ('0.00515492', 0.00263045),
       ('0.01053369', 0.00819199), ('0.06576333', 0.1631007 )],
      dtype=[('col_1', '<U32'), ('col_2', '<f8')])
In [425]: arr.shape
Out[425]: (16,)
And the save:
In [426]: np.savetxt('table.csv', arr, delimiter=',', header='col_1,col_2', fmt='%s %f')
In [427]: cat table.csv
# col_1,col_2
0.01722497 0.048717
0.112254
0.09496404 0.208370
0.03654174 0.054736
...
The linked SO that I answered before had a more complex dtype. This is a simple 2 field case, so doesn't need special handling.
The """ values might give problems when file loading. I'd suggest at least using delimiter like ,, so the loader can treat it as missing value.
In [428]: np.savetxt('table.csv', arr, delimiter=',', header='col_1,col_2', fmt='%s, %f')
In [429]: cat table.csv
# col_1,col_2
0.01722497, 0.048717
, 0.112254
0.09496404, 0.208370
0.03654174, 0.054736
...
In [430]: np.genfromtxt('table.csv', dtype=None, names=True, delimiter=',')
Out[430]:
array([(0.01722497, 0.048717), (       nan, 0.112254),
       (0.09496404, 0.20837 ), (0.03654174, 0.054736),
       (0.03624997, 0.023444), (0.01583785, 0.017394),
       ...
In [431]: np.genfromtxt('table.csv', dtype=arr.dtype, names=True, delimiter=',')
Out[431]:
array([('0.01722497', 0.048717), ('', 0.112254), ('0.09496404', 0.20837 ),
       ('0.03654174', 0.054736), ('0.03624997', 0.023444),
       ('0.01583785', 0.017394), ('0.02002064', 0.00524 ),
       ...
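Putting the two changes together against the original snippet - build the structured array 1-D (the fields supply the columns) and put the separator inside fmt - a minimal consolidated sketch:

import numpy as np

dt = np.dtype([('col_1', 'U32'), ('col_2', float)])
arr = np.empty(len(row_a), dtype=dt)   # 1-D: one record per row, one field per column
arr['col_1'] = row_a
arr['col_2'] = row_b

# One % format per field; the comma acts as the column separator in the output.
np.savetxt('table.csv', arr, header='col_1,col_2', fmt='%s,%f')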
I am getting familiar with scikit-learn and its pandas integration using the Titanic tutorial on Kaggle. I have cleaned my data and would like to make some predictions. I can do that by calling fit and predict on a pipeline - unfortunately I get an error when trying to do the same with cross_val_score.
I am using the sklearn-pandas cross_val_score.
The code is as follows:
mapping = [
    ('Age', None),
    ('Embarked', LabelBinarizer()),
    ('Fare', None),
    ('Pclass', LabelBinarizer()),
    ('Sex', LabelBinarizer()),
    ('Group', LabelBinarizer()),
    ('familySize', None),
    ('familyType', LabelBinarizer()),
    ('Title', LabelBinarizer())
]
pipe = Pipeline([
    ('featurize', DataFrameMapper(mapping)),
    ('logReg', LogisticRegression())
])

X = df_train[df_train.columns.drop('Survived')]
y = df_train['Survived']

# model = pipe.fit(X=X, y=y)
# prediction = model.predict(df_train)

score = cross_val_score(pipe, X=X, y=y, scoring='accuracy')
df_train is a pandas DataFrame containing my whole training set, including the outcomes. The two commented lines:
model = pipe.fit(X = X, y = y)
prediction = model.predict(df_train)
work fine, and prediction returns an array of predicted outcomes. Using the same data with cross_val_score, I get the following error:
X has 20 features per sample; expecting 19
The full code is below; it can be run with the Titanic CSV files from Kaggle (https://www.kaggle.com/c/titanic/data).
#%% Libraries import
import pandas as pd
import numpy as np
from sklearn_pandas import DataFrameMapper, cross_val_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
#%% Read the data
path = 'E:/Kaggle/Titanic/Data/'
file_training = 'train.csv'
file_test = 'test.csv'
#Import the training and test dataset and concatenate them
df_training = pd.read_csv(path + file_training, header = 0, index_col = 'PassengerId')
df_test = pd.read_csv(path + file_test, header = 0, index_col = 'PassengerId')
# Work on the concatenated training and test data for feature engineering and clean-up
df = pd.concat([df_training, df_test], keys = ['train','test'])
#%% Initial data exploration and cleaning
df.describe(include = 'all')
pd.isnull(df).sum() > 0
#%% Preprocesing and Cleanup
#Create new columns with the name (to identify individuals part of a family)
df['LName'] = df['Name'].apply(lambda x:x.split(',')[0].strip())
df['FName'] = df['Name'].apply(lambda x:x.split(',')[1].split('.')[1].strip())
#Get the title
df['Title'] = df['Name'].apply(lambda x:x.split(',')[1].split('.')[0].strip())
titleDic = {
    'Master': 'kid',
    'Mlle': 'unmarriedWoman',
    'Miss': 'unmarriedWoman',
    'Ms': 'unmarriedWoman',
    'Jonkheer': 'noble',
    'Don': 'noble',
    'Dona': 'noble',
    'Sir': 'noble',
    'Lady': 'noble',
    'the Countess': 'noble',
    'Capt': 'ranked',
    'Major': 'ranked',
    'Col': 'ranked',
    'Mr': 'standard',
    'Mme': 'standard',
    'Mrs': 'standard',
    'Dr': 'academic',
    'Rev': 'academic'
}
df['Group'] = df['Title'].map(titleDic)
#%% Working with the family size
#Get the family size
df['familySize'] = df['Parch'] + df['SibSp'] + 1
#Add a family tag (single, couple, small, large)
df['familyType'] = pd.cut(df['familySize'],
                          [1, 2, 3, 5, np.inf],
                          labels=['single', 'couple', 'sFamily', 'bFamily'],
                          right=False)
#%% Filling empty values
#Fill empty values with the mean or mode for the column
#Fill the missing values with mean for age per title, class and gender. Store value in AgeFull variable
agePivot = pd.DataFrame(df.groupby(['Group', 'Sex'])['Age'].median())
agePivot.columns = ['AgeFull']
df = pd.merge(df, agePivot, left_on = ['Group', 'Sex'], right_index = True)
df.loc[df['Age'].isnull(),['Age']] = df['AgeFull']
#Embark location missing values
embarkPivot = pd.DataFrame(df.groupby(['Group'])['Embarked'].agg(lambda x:x.value_counts().index[0]))
embarkPivot.columns = ['embarkFull']
df = pd.merge(df, embarkPivot, left_on = ['Group'], right_index = True)
df.loc[df['Embarked'].isnull(),['Embarked']] = df['embarkFull']
#Fill the missing fare value
df.loc[df['Fare'].isnull(), 'Fare'] = df['Fare'].mean()
#%% Final clean-up (drop temporary columns)
df = df.drop(['AgeFull', 'embarkFull'], axis=1)
#%% Preparation for training
df_train = df.loc['train']
df_test = df.loc['test']
#Creation of dummy variables
mapping = [
    ('Age', None),
    ('Embarked', LabelBinarizer()),
    ('Fare', None),
    ('Pclass', LabelBinarizer()),
    ('Sex', LabelBinarizer()),
    ('Group', LabelBinarizer()),
    ('familySize', None),
    ('familyType', LabelBinarizer()),
    ('Title', LabelBinarizer())
]
pipe = Pipeline(steps=[
    ('featurize', DataFrameMapper(mapping)),
    ('logReg', LogisticRegression())
])
#Uncommenting the line below fixes the code - why?
#df_train = df_train.sort_index()
X = df_train[df_train.columns.drop(['Survived'])]
y = df_train.Survived
score = cross_val_score(pipe, X = df_train, y = df_train.Survived, scoring = 'accuracy')
This is very interesting. I solved the issue just by sorting the DataFrame by its index before passing it to cross_val_score in the pipeline:
df_train = df_train.sort_index()
Could anyone explain why this has an impact on how scikit-learn works?
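For what it's worth, a minimal sketch of a workaround rather than an explanation (an assumption on my part, not a confirmed diagnosis of the behaviour): keep the sort-index fix observed above and call scikit-learn's own cross_val_score on the pipeline, which splits the DataFrame positionally.

from sklearn.model_selection import cross_val_score as sk_cross_val_score  # aliased to avoid clashing with the sklearn-pandas import

# Restore the original row order that the merges changed (the observed fix),
# then cross-validate the full pipeline with scikit-learn directly.
df_train = df_train.sort_index()
X = df_train[df_train.columns.drop(['Survived'])]
y = df_train['Survived']

score = sk_cross_val_score(pipe, X, y, scoring='accuracy', cv=5)
print(score.mean())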