Why does tf.estimator.DNNRegressor predict negative y values? - tensorflow
The predict() function of tf.estimator.DNNRegressor is behaving strangely: it predicts negative y values even though the training dataset contains no negative y values. I noticed this after scaling y down by a factor of 1000, so a value that used to be 12000 becomes 12. The range of y is now [3, 400], yet after this change predict() outputs some negative values. I did not set the activation function in tf.estimator.DNNRegressor, so the default activation is relu, whose range is [0, max). Why does it predict negative values? Is this a bug in tf.estimator.DNNRegressor, or is no activation function applied to the output y? Thank you.
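For reference, this is how the activation argument would look if passed explicitly (a sketch that reuses the feature columns and hidden units from the code below); my understanding is that activation_fn only applies to the hidden layers, while the output layer of DNNRegressor is a plain linear unit, so its predictions are not constrained to [0, max):
# Sketch only: the same estimator as below, with the default activation written out.
regressor = tf.estimator.DNNRegressor(
    feature_columns=feature_cols,      # defined in the full code below
    hidden_units=[250, 200, 100, 50],
    activation_fn=tf.nn.relu,          # the default, applied to the hidden layers
    model_dir="./model")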
The code is:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import itertools
import pandas as pd
import tensorflow as tf
from sklearn import datasets, metrics
import csv
tf.logging.set_verbosity(tf.logging.INFO)
COLUMNS = ["col1","col2","col3","col4","col5","col6","col7","col8","col9","col10","col11","col12","col13","col14","col15","col16","col17","col18","col19","col20","col21","col22","col23","col24","col25","col26","col27","col28","col29","col30","col31","col32","col33","col34","col35","col36","col37","col38","col39","col40","col41","col42","col43","col44","col45","col46","col47","col48","col49","col50","col51","col52","col53","col54","col55","col56","col57","col58","col59","col60","col61","col62","col63","col64","col65","col66","col67","col68","col69","col70","col71","col72","col73","col74","col75","col76","col77","col78","col79","col80","col81","col82","col83","col84","col85","col86","col87","col88","col89","col90","col91","col92","col93","col94","col95","col96","col97","col98","col99","col100","col101","col102","col103","col104","col105","col106","col107","col108","col109","col110","col111","col112","col113","col114","col115","col116","col117","col118","col119","col120","col121","col122","col123","col124","col125","col126","col127","col128","col129","col130","col131","col132","col133","col134","col135","col136","col137","col138","col139","col140","col141","col142","col143","col144","col145","col146","col147","col148","col149","col150","col151","col152","col153","col154","col155","col156","col157","col158","col159","col160","col161","col162","col163","col164","col165","col166","col167","col168","col169","col170","col171","col172","col173","col174","col175","col176","col177","col178","col179","col180","col181","col182","col183","col184","col185","col186","col187","col188","col189","col190","col191","col192","col193","col194","col195","col196","col197","col198","col199","col200","col201","col202","col203","col204","col205","col206","col207","col208","col209","col210","col211","col212","col213","col214"]
FEATURES = ["col1","col2","col3","col4","col5","col6","col7","col8","col9","col10","col11","col12","col13","col14","col15","col16","col17","col18","col19","col20","col21","col22","col23","col24","col25","col26","col27","col28","col29","col30","col31","col32","col33","col34","col35","col36","col37","col38","col39","col40","col41","col42","col43","col44","col45","col46","col47","col48","col49","col50","col51","col52","col53","col54","col55","col56","col57","col58","col59","col60","col61","col62","col63","col64","col65","col66","col67","col68","col69","col70","col71","col72","col73","col74","col75","col76","col77","col78","col79","col80","col81","col82","col83","col84","col85","col86","col87","col88","col89","col90","col91","col92","col93","col94","col95","col96","col97","col98","col99","col100","col101","col102","col103","col104","col105","col106","col107","col108","col109","col110","col111","col112","col113","col114","col115","col116","col117","col118","col119","col120","col121","col122","col123","col124","col125","col126","col127","col128","col129","col130","col131","col132","col133","col134","col135","col136","col137","col138","col139","col140","col141","col142","col143","col144","col145","col146","col147","col148","col149","col150","col151","col152","col153","col154","col155","col156","col157","col158","col159","col160","col161","col162","col163","col164","col165","col166","col167","col168","col169","col170","col171","col172","col173","col174","col175","col176","col177","col178","col179","col180","col181","col182","col183","col184","col185","col186","col187","col188","col189","col190","col191","col192","col193","col194","col195","col196","col197","col198","col199","col200","col201","col202","col203","col204","col205","col206","col207","col208","col209","col211","col212","col213"]
LABEL = "col214"
def get_input_fn(data_set, num_epochs=None, shuffle=True):
    return tf.estimator.inputs.pandas_input_fn(
        x=pd.DataFrame({k: data_set[k].values for k in FEATURES}),
        y=pd.Series(data_set[LABEL].values),
        num_epochs=num_epochs,
        shuffle=shuffle)
def get_mae(y_pre, y_target):
    absError = []
    for i in range(len(y_pre)):
        absError.append(abs(y_pre[i] - y_target[i]))
    return sum(absError) / len(absError)
def get_mse(y_pre, y_target):
    squaredError = []
    for i in range(len(y_pre)):
        val = y_pre[i] - y_target[i]
        squaredError.append(val * val)
    return sum(squaredError) / len(squaredError)
training_set = pd.read_csv("train.csv", skipinitialspace=True, skiprows=1, names=COLUMNS)
test_set = pd.read_csv("test.csv", skipinitialspace=True, skiprows=1, names=COLUMNS)
predict_set = pd.read_csv("predict.csv", skipinitialspace=True, skiprows=1, names=COLUMNS)
feature_cols = [tf.feature_column.numeric_column(k) for k in FEATURES]
regressor = tf.estimator.DNNRegressor(feature_columns=feature_cols, hidden_units=[250, 200, 100, 50], model_dir="./model")
regressor.train(input_fn=get_input_fn(training_set), steps=8000)
ev = regressor.evaluate(input_fn=get_input_fn(test_set, num_epochs=1, shuffle=False))
loss_score = ev["loss"]
print("Loss: {0:f}".format(loss_score))
predict = regressor.predict(input_fn=get_input_fn(predict_set, num_epochs=1, shuffle=False))
y_predict = predict_set[LABEL].values.tolist()  # actual y values from predict.csv
print(type(y_predict))
print(y_predict)
list_predict = list(predict)
print(type(list_predict))
y_predicted = []  # model predictions extracted from the generator returned by predict()
for i in range(len(list_predict)):
    y_predicted.append(list_predict[i]['predictions'][0])
print(y_predicted)
fileObject = open('time_prediction.txt', 'w')
for time in y_predicted:
    fileObject.write(str(time))
    fileObject.write('\n')
fileObject.close()
mae = get_mae(y_predict, y_predicted)
mse = get_mse(y_predict, y_predicted)
print("Mean Absolute Error:" + str(mae) + " Mean Squared Error:" + str(mse))
#mae = tf.metrics.mean_absolute_error(y_predict, list_predict)
#print(mae)
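As a sanity check on the two helper functions above, the same metrics can be computed with numpy (just a sketch; it assumes both lists have the same length, as in the code above):
# Sketch: numpy equivalents of get_mae / get_mse for cross-checking.
import numpy as np
y_true = np.asarray(y_predict)
y_hat = np.asarray(y_predicted)
print("MAE:", np.mean(np.abs(y_hat - y_true)))
print("MSE:", np.mean((y_hat - y_true) ** 2))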
Here are 3 data records from the dataset:
2399.998,4,100,100,0,0,1,10,3,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,22,0,0,2,44,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,3,3,1,0,0,0,5,0,0,0,0,2,0,0,1,4,13,4,0,11,14,15,10,8,0,0,3,1,0,0,0,0,0,0,0,0,0,0,1,364,123428,1397595,16772133,56,103,16772153,22,22,11
1919.9984,2,30,30,0,0,1,10,3,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,22,0,0,0,38,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,3,3,1,0,0,0,5,0,0,0,0,2,0,0,0,0,12,2,0,9,14,10,9,2,0,0,2,1,0,0,0,0,0,0,0,0,0,0,1,17525535,34347257,1397595,5590711,16698805,103,5913257,896853,1190468,25
479.9996,2,60,60,0,0,1,10,3,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,22,0,0,0,38,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,3,3,1,0,0,0,5,0,0,0,0,2,0,0,0,0,12,2,0,9,14,10,9,2,0,0,2,1,0,0,0,0,0,0,0,0,0,0,1,17525535,34347257,1397595,5590711,16698805,103,5913257,896853,1190468,168
The last column is y.
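In case it matters for the discussion, here is a small sketch of how the predictions could be kept non-negative regardless of the cause (these are workarounds I am assuming, not something DNNRegressor does by itself):
# Sketch: two possible workarounds, assuming y_predicted is the list built above.
import numpy as np
# (a) clip the raw outputs at zero
y_predicted_nonneg = np.maximum(y_predicted, 0.0)
# (b) or train on log(y) and invert the transform after predict(), e.g. use
#     y=pd.Series(np.log(data_set[LABEL].values)) in get_input_fn and then
#     apply np.exp to the predictions.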
Related
RuntimeError: The size of tensor a (49) must match the size of tensor b (64) at non-singleton dimension 1
I have been working with Swin Transformer attention maps. Below is my code implementation:
from PIL import Image
import numpy
import sys
import torch
from torchvision import transforms
import numpy as np
import cv2

def rollout(attentions, discard_ratio, head_fusion):
    result = torch.eye(attentions[0].size(-1))
    with torch.no_grad():
        for attention in attentions:
            # print(attentions)
            if head_fusion == "mean":
                attention_heads_fused = attention.mean(axis=1)
            elif head_fusion == "max":
                attention_heads_fused = attention.max(axis=1)[0]
            elif head_fusion == "min":
                attention_heads_fused = attention.min(axis=1)[0]
            else:
                raise ValueError("Attention head fusion type Not supported")
            # Drop the lowest attentions, but
            # don't drop the class token
            flat = attention_heads_fused.view(attention_heads_fused.size(0), -1)
            # print(flat)
            _, indices = flat.topk(int(flat.size(-1)*discard_ratio), -1, False)
            # print("_ : ",_," indices : ",indices)
            indices = indices[indices != 0]
            flat[0, indices] = 0
            I = torch.eye(attention_heads_fused.size(-1))
            # print("I : ",I)
            a = (attention_heads_fused + 1.0*I)/2
            # print("a : ",a)
            # print(a.size())
            print(a.sum(dim=-1))
            a = a / a.sum(dim=-1)
            result = torch.matmul(a, result)
            # print("result : ",result)
    # Look at the total attention between the class token,
    # and the image patches
    mask = result[0, 0, 1:]
    # In case of 224x224 image, this brings us from 196 to 14
    width = int(mask.size(-1)**0.5)
    mask = mask.reshape(width, width).numpy()
    mask = mask / np.max(mask)
    return mask

class VITAttentionRollout:
    def __init__(self, model, attention_layer_name='dropout', head_fusion="mean", discard_ratio=0.9):
        self.model = model
        self.head_fusion = head_fusion
        self.discard_ratio = discard_ratio
        # print(self.model.named_modules())
        for name, module in self.model.named_modules():
            # print("Name : ",name," Module : ",module)
            if attention_layer_name in name:
                module.register_forward_hook(self.get_attention)
                # print(self.attentions)
        self.attentions = []

    def get_attention(self, module, input, output):
        self.attentions.append(output.cpu())

    def __call__(self, input_tensor):
        self.attentions = []
        with torch.no_grad():
            output = self.model(**input_tensor)
            # print(output)
        return rollout(self.attentions, self.discard_ratio, self.head_fusion)
This is the main program:
import sys
import torch
from PIL import Image
from torchvision import transforms
import numpy as np
import cv2
from google.colab.patches import cv2_imshow
# from vit_rollout import VITAttentionRollout
from vit_grad_rollout import VITAttentionGradRollout

def show_mask_on_image(img, mask):
    img = np.float32(img) / 255
    heatmap = cv2.applyColorMap(np.uint8(255 * mask), cv2.COLORMAP_JET)
    heatmap = np.float32(heatmap) / 255
    cam = heatmap + np.float32(img)
    cam = cam / np.max(cam)
    return np.uint8(255 * cam)

if __name__ == '__main__':
    model.eval()
    image_path = '/content/both.jpg'
    category_index = None
    head_fusion = 'max'
    discard_ratio = 0.9
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5,0.5,0.5], std=[0.5,0.5,0.5]),
    ])
    img = Image.open(image_path)
    img = img.resize((224, 224))
    input_tensor = feature_extractor(img, return_tensors="pt")
    #print(input_tensor)
    if category_index is None:
        print("Doing Attention Rollout")
        attention_rollout = VITAttentionRollout(model, head_fusion=head_fusion, discard_ratio=discard_ratio)
        mask = attention_rollout(input_tensor)
        name = "attention_rollout_{:.3f}_{}.png".format(discard_ratio, head_fusion)
    else:
        print("Doing Gradient Attention Rollout")
        grad_rollout = VITAttentionGradRollout(model, discard_ratio=discard_ratio)
        mask = grad_rollout(input_tensor, category_index)
        name = "grad_rollout_{}_{:.3f}_{}.png".format(category_index, discard_ratio, head_fusion)
    np_img = np.array(img)[:, :, ::-1]
    mask = cv2.resize(mask, (np_img.shape[1], np_img.shape[0]))
    mask = show_mask_on_image(np_img, mask)
    cv2_imshow(np_img)
    cv2_imshow(mask)
    cv2.imwrite("input.jpg", np_img)
    cv2.imwrite(name, mask)
    cv2.waitKey(-1)
I am referring to the git project https://github.com/jacobgil/vit-explain, but I am getting the error: RuntimeError: The size of tensor a (49) must match the size of tensor b (64) at non-singleton dimension 1. I researched some git projects, but there is very little information on Swin Transformers. So is there any way I can make an attention map for Swin Transformer models? Please help with it. Thanks in advance.
How to calculate confidence intervals for predictions in regression, and how to plot them in Python?
Fig 7.1, An Introduction to Statistical Learning
I am currently studying the book Introduction to Statistical Learning with Applications in R and converting the solutions to Python. I cannot work out how to compute the confidence intervals and plot them as shown in the image above (dashed lines). I have plotted the fitted line. Here's my code for that (I am using polynomial regression of degree 4 with predictor 'age' and response 'wage'):
poly = PolynomialFeatures(4)
X = poly.fit_transform(data['age'].to_frame())
y = data['wage']
# X.shape
model = sm.OLS(y,X).fit()
print(model.summary())
# So, what we want here is not only the fitted line, but also the standard error related to the line.
# To find that we need to calculate the predictions for some values of age.
test_ages = np.linspace(data['age'].min(),data['age'].max(),100)
X_test = poly.transform(test_ages.reshape(-1,1))
pred = model.predict(X_test)
plt.figure(figsize = (12,8))
plt.scatter(data['age'],data['wage'],facecolors='none', edgecolors='darkgray')
plt.plot(test_ages,pred)
Here data is the Wage data set available in R. This is the resulting graph I get:
I have used bootstrapping to calculate the confidence intervals; for this I have used a self-written module:
import numpy as np
import pandas as pd
from tqdm import tqdm

class Bootstrap_ci:
    def boot(self,X_data,y_data,R,test_data,model):
        predictions = []
        for i in tqdm(range(R)):
            predictions.append(self.alpha(X_data,y_data,self.get_indices(X_data,200),test_data,model))
        return np.percentile(predictions,2.5,axis = 0),np.percentile(predictions,97.5,axis = 0)

    def alpha(self,X_data,y_data,index,test_data,model):
        X = X_data.loc[index]
        y = y_data.loc[index]
        lr = model
        lr.fit(pd.DataFrame(X),y)
        return lr.predict(pd.DataFrame(test_data))

    def get_indices(self,data,num_samples):
        return np.random.choice(data.index, num_samples, replace=True)
The above module can be used as:
poly = PolynomialFeatures(4)
X = poly.fit_transform(data['age'].to_frame())
y = data['wage']
X_test = np.linspace(min(data['age']),max(data['age']),100)
X_test_poly = poly.transform(X_test.reshape(-1,1))

from bootstrap import Bootstrap_ci
bootstrap = Bootstrap_ci()
li,ui = bootstrap.boot(pd.DataFrame(X),y,1000,X_test_poly,LinearRegression())
This will give us the lower and upper confidence intervals. To plot the graph:
plt.scatter(data['age'],data['wage'],facecolors='none', edgecolors='darkgray')
plt.plot(X_test,pred,label = 'Fitted Line')
plt.plot(X_test,ui,linestyle = 'dashed',color = 'r',label = 'Confidence Intervals')
plt.plot(X_test,li,linestyle = 'dashed',color = 'r')
The resultant graph is:
The following code results in the 95% confidence interval:
from scipy import stats

confidence = 0.95
squared_errors = (<<predicted values>> - <<true y_test values>>) ** 2
np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                         loc=squared_errors.mean(),
                         scale=stats.sem(squared_errors)))
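For concreteness, a usage sketch with placeholder arrays y_pred and y_test (both names are assumptions), which yields the interval for the RMSE:
import numpy as np
from scipy import stats

confidence = 0.95
squared_errors = (y_pred - y_test) ** 2   # y_pred, y_test: placeholder numpy arrays
rmse_low, rmse_high = np.sqrt(stats.t.interval(
    confidence, len(squared_errors) - 1,
    loc=squared_errors.mean(), scale=stats.sem(squared_errors)))
print(rmse_low, rmse_high)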
Error when using tensorflow HMC to marginalise GPR hyperparameters
I would like to use TensorFlow (version 2) for Gaussian process regression to fit some data, and I found the Google Colab example online here [1]. I have turned some of this notebook into the minimal example below. Sometimes the code fails with the following error when using MCMC to marginalize the hyperparameters, and I was wondering if anyone has seen this before or knows how to get around it:
tensorflow.python.framework.errors_impl.InvalidArgumentError: Input matrix is not invertible.
[[{{node mcmc_sample_chain/trace_scan/while/body/_168/smart_for_loop/while/body/_842/dual_averaging_step_size_adaptation___init__/_one_step/transformed_kernel_one_step/mh_one_step/hmc_kernel_one_step/leapfrog_integrate/while/body/_1244/leapfrog_integrate_one_step/maybe_call_fn_and_grads/value_and_gradients/value_and_gradient/gradients/leapfrog_integrate_one_step/maybe_call_fn_and_grads/value_and_gradients/value_and_gradient/PartitionedCall_grad/PartitionedCall/gradients/JointDistributionNamed/log_prob/JointDistributionNamed_log_prob_GaussianProcess/log_prob/JointDistributionNamed_log_prob_GaussianProcess/get_marginal_distribution/Cholesky_grad/MatrixTriangularSolve}}]] [Op:__inference_do_sampling_113645]
Function call stack: do_sampling
[1] https://colab.research.google.com/github/tensorflow/probability/blob/master/tensorflow_probability/examples/jupyter_notebooks/Gaussian_Process_Regression_In_TFP.ipynb#scrollTo=jw-_1yC50xaM
Note that some of the code below is a bit redundant in some sections, but it should be able to reproduce the error. Thanks!
import time
import numpy as np
import tensorflow.compat.v2 as tf
import tensorflow_probability as tfp
tfb = tfp.bijectors
tfd = tfp.distributions
tfk = tfp.math.psd_kernels
tf.enable_v2_behavior()
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
#%pylab inline
# Configure plot defaults
plt.rcParams['axes.facecolor'] = 'white'
plt.rcParams['grid.color'] = '#666666'
#%config InlineBackend.figure_format = 'png'

def sinusoid(x):
    return np.sin(3 * np.pi * x[..., 0])

def generate_1d_data(num_training_points, observation_noise_variance):
    """Generate noisy sinusoidal observations at a random set of points.

    Returns:
      observation_index_points, observations
    """
    index_points_ = np.random.uniform(-1., 1., (num_training_points, 1))
    index_points_ = index_points_.astype(np.float64)
    # y = f(x) + noise
    observations_ = (sinusoid(index_points_) +
                     np.random.normal(loc=0,
                                      scale=np.sqrt(observation_noise_variance),
                                      size=(num_training_points)))
    return index_points_, observations_

# Generate training data with a known noise level (we'll later try to recover
# this value from the data).
NUM_TRAINING_POINTS = 100
observation_index_points_, observations_ = generate_1d_data(
    num_training_points=NUM_TRAINING_POINTS,
    observation_noise_variance=.1)

def build_gp(amplitude, length_scale, observation_noise_variance):
    """Defines the conditional dist. of GP outputs, given kernel parameters."""
    # Create the covariance kernel, which will be shared between the prior (which we
    # use for maximum likelihood training) and the posterior (which we use for
    # posterior predictive sampling)
    kernel = tfk.ExponentiatedQuadratic(amplitude, length_scale)
    # Create the GP prior distribution, which we will use to train the model
    # parameters.
    return tfd.GaussianProcess(
        kernel=kernel,
        index_points=observation_index_points_,
        observation_noise_variance=observation_noise_variance)

gp_joint_model = tfd.JointDistributionNamed({
    'amplitude': tfd.LogNormal(loc=0., scale=np.float64(1.)),
    'length_scale': tfd.LogNormal(loc=0., scale=np.float64(1.)),
    'observation_noise_variance': tfd.LogNormal(loc=0., scale=np.float64(1.)),
    'observations': build_gp,
})

x = gp_joint_model.sample()
lp = gp_joint_model.log_prob(x)
print("sampled {}".format(x))
print("log_prob of sample: {}".format(lp))

# Create the trainable model parameters, which we'll subsequently optimize.
# Note that we constrain them to be strictly positive.
constrain_positive = tfb.Shift(np.finfo(np.float64).tiny)(tfb.Exp())

amplitude_var = tfp.util.TransformedVariable(
    initial_value=1.,
    bijector=constrain_positive,
    name='amplitude',
    dtype=np.float64)

length_scale_var = tfp.util.TransformedVariable(
    initial_value=1.,
    bijector=constrain_positive,
    name='length_scale',
    dtype=np.float64)

observation_noise_variance_var = tfp.util.TransformedVariable(
    initial_value=1.,
    bijector=constrain_positive,
    name='observation_noise_variance_var',
    dtype=np.float64)

trainable_variables = [v.trainable_variables[0] for v in
                       [amplitude_var, length_scale_var, observation_noise_variance_var]]

# Use `tf.function` to trace the loss for more efficient evaluation.
#tf.function(autograph=False, experimental_compile=False)
def target_log_prob(amplitude, length_scale, observation_noise_variance):
    return gp_joint_model.log_prob({
        'amplitude': amplitude,
        'length_scale': length_scale,
        'observation_noise_variance': observation_noise_variance,
        'observations': observations_
    })

# Now we optimize the model parameters.
num_iters = 1000
optimizer = tf.optimizers.Adam(learning_rate=.01)

# Store the likelihood values during training, so we can plot the progress
lls_ = np.zeros(num_iters, np.float64)
for i in range(num_iters):
    with tf.GradientTape() as tape:
        loss = -target_log_prob(amplitude_var, length_scale_var,
                                observation_noise_variance_var)
    grads = tape.gradient(loss, trainable_variables)
    optimizer.apply_gradients(zip(grads, trainable_variables))
    lls_[i] = loss

print('Trained parameters:')
print('amplitude: {}'.format(amplitude_var._value().numpy()))
print('length_scale: {}'.format(length_scale_var._value().numpy()))
print('observation_noise_variance: {}'.format(observation_noise_variance_var._value().numpy()))

num_results = 100
num_burnin_steps = 50

sampler = tfp.mcmc.TransformedTransitionKernel(
    tfp.mcmc.HamiltonianMonteCarlo(
        target_log_prob_fn=target_log_prob,
        step_size=tf.cast(0.1, tf.float64),
        num_leapfrog_steps=8),
    bijector=[constrain_positive, constrain_positive, constrain_positive])

adaptive_sampler = tfp.mcmc.DualAveragingStepSizeAdaptation(
    inner_kernel=sampler,
    num_adaptation_steps=int(0.8 * num_burnin_steps),
    target_accept_prob=tf.cast(0.75, tf.float64))

initial_state = [tf.cast(x, tf.float64) for x in [1., 1., 1.]]

# Speed up sampling by tracing with `tf.function`.
#tf.function(autograph=False, experimental_compile=False)
def do_sampling():
    return tfp.mcmc.sample_chain(
        kernel=adaptive_sampler,
        current_state=initial_state,
        num_results=num_results,
        num_burnin_steps=num_burnin_steps,
        trace_fn=lambda current_state, kernel_results: kernel_results)

t0 = time.time()
samples, kernel_results = do_sampling()
t1 = time.time()
print("Inference ran in {:.2f}s.".format(t1-t0))
This can happen if you have multiple index points that are very close together, so you might consider using np.linspace or doing some post-filtering of your random draw. I would also suggest a slightly bigger epsilon, maybe 1e-6.
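A sketch of what that suggestion could look like inside generate_1d_data from the question (the variable names follow the question's code; the 1e-3 spacing threshold is an arbitrary choice for illustration):
# Sketch: evenly spaced index points instead of a random draw ...
index_points_ = np.linspace(-1., 1., num_training_points,
                            dtype=np.float64)[..., np.newaxis]

# ... or keep the random draw but filter out near-duplicate points.
draw = np.sort(np.random.uniform(-1., 1., (num_training_points, 1)), axis=0)
keep = np.concatenate([[True], np.diff(draw[:, 0]) > 1e-3])
index_points_ = draw[keep].astype(np.float64)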
Use matplotlib to plot scikit learn linear regression results
How can you plot the linear regression results from scikit-learn after the analysis, to see the "testing" data (real values vs. predicted values) at the end of the program? The code below is close, but I believe it is missing a scaling factor.
input:
import pandas as pd
import numpy as np
import datetime
pd.core.common.is_list_like = pd.api.types.is_list_like  # temp fix
import fix_yahoo_finance as yf
from pandas_datareader import data, wb
from datetime import date
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing, cross_validation, svm
import matplotlib.pyplot as plt

df = yf.download('MMM', start = date (2012, 1, 1), end = date (2018, 1, 1), progress = False)
df_low = df[['Low']]  # create a new df with only the low column

forecast_out = int(5)  # predicting some days into future
df_low['low_prediction'] = df_low[['Low']].shift(-forecast_out)  # create a new column based on the existing col but shifted some days

X_low = np.array(df_low.drop(['low_prediction'], 1))
X_low = preprocessing.scale(X_low)  # scaling the input values
X_low_forecast = X_low[-forecast_out:]  # set X_forecast equal to last 5 days
X_low = X_low[:-forecast_out]  # remove last 5 days from X

y_low = np.array(df_low['low_prediction'])
y_low = y_low[:-forecast_out]

X_low_train, X_low_test, y_low_train, y_low_test = cross_validation.train_test_split(X_low, y_low, test_size = 0.2)

clf_low = LinearRegression()  # classifier
clf_low.fit(X_low_train, y_low_train)  # training
confidence_low = clf_low.score(X_low_test, y_low_test)  # testing
print("confidence for lows: ", confidence_low)

forecast_prediction_low = clf_low.predict(X_low_forecast)
print(forecast_prediction_low)

plt.figure(figsize = (17,9))
plt.grid(True)
plt.plot(X_low_test, color = "red")
plt.plot(y_low_test, color = "green")
plt.show()
image:
You plot y_test and X_test, while you should plot y_test and clf_low.predict(X_test) instead if you want to compare target and predicted values. By the way, clf_low in your code is not a classifier, it is a regressor. It's better to use the alias model instead of clf.
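A sketch of the suggested change, reusing the variable names from the question (plotting the two series side by side is just one reasonable way to draw the comparison):
# Sketch: compare true vs. predicted values on the held-out test split.
model_low = clf_low                     # same fitted regressor, clearer alias
y_low_pred = model_low.predict(X_low_test)

plt.figure(figsize=(17, 9))
plt.grid(True)
plt.plot(y_low_test, color="green", label="actual")
plt.plot(y_low_pred, color="red", label="predicted")
plt.legend()
plt.show()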
OneHotEncoding mapping issue between training data and test data
I've transformed the training and test data sets with sklearn's OneHotEncoder. However, the transformed results have different shapes, so it is impossible to apply other algorithms like logistic regression. How do I reshape the test data in accordance with the training data set's shape?
Best regards, Chris
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

def data_transformation(data, dummy):
    le = LabelEncoder()
    # Encoding the columns with multiple categorical levels
    for col1 in dummy:
        le.fit(data[col1])
        data[col1] = le.transform(data[col1])
    dummy_data = np.array(data[dummy])
    enc = OneHotEncoder()
    enc.fit(dummy_data)
    dummy_data = enc.transform(dummy_data).toarray()

if __name__ == '__main__':
    data = pd.read_csv('train.data', delimiter=',')
    data_test = pd.read_csv('test.data', delimiter=',')
    dummy_columns = ['Column1', 'Column2']
    data = data_transformation(data, dummy_columns)
    data_test = data_transformation(data_test, dummy_columns)

# result
# data shape : (200000, 71)
# data_test shape : (15000, 32)
Thank you so much, Vivek! I've solved this issue thanks to your help.
def data_transformation2(data, data_test, dummy):
    le = LabelEncoder()
    # Encoding the columns with multiple categorical levels
    for col in dummy:
        le.fit(data[col])
        data[col] = le.transform(data[col])
    for col in dummy:
        le.fit(data_test[col])
        data_test[col] = le.transform(data_test[col])
    enc = OneHotEncoder()
    dummy_data = np.array(data[dummy])
    dummy_data_test = np.array(data_test[dummy])
    enc.fit(dummy_data)
    dummy_data = enc.transform(dummy_data).toarray()
    dummy_data_test = enc.transform(dummy_data_test).toarray()
    print(dummy_data.shape)
    print(dummy_data_test.shape)
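An alternative sketch along the same lines, assuming a recent scikit-learn where OneHotEncoder accepts string columns directly (handle_unknown='ignore' is a real OneHotEncoder option; the column names are illustrative): fit the encoder on the training columns only and reuse it on the test columns, so the output widths match by construction.
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

dummy_columns = ['Column1', 'Column2']   # placeholder column names

# Fit on the training categories only, then reuse the same fitted encoder on the test data.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(data[dummy_columns])
dummy_train = enc.transform(data[dummy_columns]).toarray()
dummy_test = enc.transform(data_test[dummy_columns]).toarray()
# dummy_train.shape[1] == dummy_test.shape[1] now holds by construction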