I need to convert netCDF4 variables to .csv format. I am having difficulty exporting multiple variables using the pandas library. Can you suggest a solution? - pandas

I have altimeter satellite data in netCDF4 format. I need to export all the variables present in the data to csv format.
I have tried using the pandas library, but my code is not working:
import netCDF4
import pandas as pd
nc_file = 'F:/data/JASON 3/2016/00 11.11.16/JA3_GPR_2PTP000_166_20160213_230539_20160214_000152.nc'
nc = netCDF4.Dataset(nc_file, mode='r')
#print(nc)
nc.variables.keys()
range_ku=nc.variables['range_ku'][:]
range1=((range_ku*0.001)+1300000.0)
mean_sea_surface=nc.variables['mean_sea_surface'][:]
mean_sea_surf_scaled=(mean_sea_surface*0.0001)
mean_topography=nc.variables['mean_topography'][:]
mean_topo_scaled=(mean_topography*0.0001)
model_dry_tropo_corr=nc.variables['model_dry_tropo_corr'][:]
topo_cr=(model_dry_tropo_corr*0.0001)
rad_wet_tropo_corr=nc.variables['rad_wet_tropo_corr'][:]
wet_topo_cr=(rad_wet_tropo_corr*0.0001)
iono_corr_alt_ku=nc.variables['iono_corr_alt_ku'][:]
iono_cr=(iono_corr_alt_ku*0.0001)
sea_state_bias_ku=nc.variables['sea_state_bias_ku'][:]
sea_state_bias_cr=(sea_state_bias_ku*0.0001)
lat = nc.variables['lat'][:]
lon = nc.variables['lon'][:]
time_var = nc.variables['time']
dtime = netCDF4.num2date(time_var[:],time_var.units)
ssha= nc.variables['ssha'][:]
ssha1=(ssha*0.001)
alt = nc.variables['alt'][:]
alt1=((alt*0.001)+1300000.0)
print(dtime)
print(lat)
print(lon)
print(alt1)
print(range1)
print(mean_sea_surface)
print(mean_topography)
print(model_dry_tropo_corr)
print(rad_wet_tropo_corr)
print(iono_corr_alt_ku)
print(sea_state_bias_ku)
nc_ts = pd.Series(lat, lon,alt1,range1,mean_sea_surface,mean_topography,model_dry_tropo_corr,rad_wet_tropo_corr,iono_corr_alt_ku,sea_state_bias_ku,dtime)
nc_ts.to_csv('data.csv',index=False, header=True)
File "F:/untitled3.py", line 50, in <module>
nc_ts = pd.Series(lat, lon,alt1,range1,mean_sea_surface,mean_topography,model_dry_tropo_corr,rad_wet_tropo_corr,iono_corr_alt_ku,sea_state_bias_ku,dtime)
TypeError: __init__() takes from 1 to 7 positional arguments but 12 were given
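One way to approach this (a sketch, not a tested answer for this particular file): pd.Series holds a single one-dimensional array, which is why passing eleven arrays positionally raises the TypeError above. A pd.DataFrame with one column per variable can be written to CSV instead. The column names below are illustrative and the values simply reuse the arrays defined in the question (the scaled variants such as mean_sea_surf_scaled could be substituted); note too that netCDF4 normally applies scale_factor/add_offset automatically when reading, so the manual rescaling is worth double-checking against the file's attributes.
import pandas as pd
# assumes dtime, lat, lon and the arrays from the code above are already defined
df = pd.DataFrame({
    'time': dtime,
    'lat': lat,
    'lon': lon,
    'alt': alt1,
    'range_ku': range1,
    'mean_sea_surface': mean_sea_surface,
    'mean_topography': mean_topography,
    'model_dry_tropo_corr': model_dry_tropo_corr,
    'rad_wet_tropo_corr': rad_wet_tropo_corr,
    'iono_corr_alt_ku': iono_corr_alt_ku,
    'sea_state_bias_ku': sea_state_bias_ku,
})
df.to_csv('data.csv', index=False)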

Related

'poorly' organized csv file

I have a CSV file that I have to do some data processing on, and it's a bit of a mess. It's about 20 columns wide, but there are multiple datasets concatenated one after another in the same columns. See the dummy file below.
I'm trying to import each sub-file into a separate pandas dataframe, but I'm not sure of the best way to parse the csv other than hard-coding the import of a certain number of rows. Any suggestions? I guess I could find where the breaks are (loop through the entire file, find them, and then read each block), but that doesn't seem very efficient, and I have lots of csv files like this to read.
import pandas as pd
nrows = 20
skiprows = 0 #but this only reads in the first block
df = pd.read_csv(csvfile, nrows=nrows, skiprows=skiprows)
Below is a dummy example:
TIME,HDRA-1,HDRA-2,HDRA-3,HDRA-4
0.473934934,0.944026678,0.460177668,0.157028404,0.221362174
0.911384892,0.336694914,0.586014563,0.828339071,0.632790473
0.772652589,0.318146985,0.162987171,0.555896202,0.659099194
0.541382917,0.033706768,0.229596419,0.388057901,0.465507295
0.462815443,0.088206108,0.717132904,0.545779038,0.268174922
0.522861489,0.736462083,0.532785319,0.961993893,0.393424116
0.128671067,0.56740537,0.689995486,0.518493779,0.94916205
0.214026742,0.176948186,0.883636252,0.732258971,0.463732841
0.769415726,0.960761306,0.401863804,0.41823372,0.812081565
0.529750933,0.360314266,0.461615009,0.387516958,0.136616263
TIME,HDRB-1,HDRB-2,HDRB-3,HDRB-4
0.92264286,0.026312552,0.905839375,0.869477136,0.985560264
0.410573341,0.004825381,0.920616162,0.19473237,0.848603523
0.999293171,0.259955029,0.380094352,0.101050014,0.428047493
0.820216119,0.655118219,0.586754951,0.568492346,0.017038336
0.040384337,0.195101879,0.778631044,0.655215972,0.701596844
0.897559206,0.659759362,0.691643603,0.155601111,0.713735399
0.860188233,0.805013656,0.772153733,0.809025634,0.257632085
0.844167809,0.268060979,0.015993504,0.95131982,0.321210766
0.86288383,0.236599974,0.279435193,0.311005146,0.037592509
0.938348876,0.941851279,0.582434058,0.900348616,0.381844182
0.344351819,0.821571854,0.187962046,0.218234588,0.376122331
0.829766776,0.869014514,0.434165111,0.051749472,0.766748447
0.327865017,0.938176948,0.216764504,0.216666543,0.278110502
0.243953506,0.030809033,0.450110334,0.097976735,0.762393831
0.484856452,0.312943244,0.443236377,0.017201097,0.038786057
0.803696521,0.328088545,0.764850865,0.090543472,0.023363909
TIME,HDRB-1,HDRB-2,HDRB-3,HDRB-4
0.342418934,0.290979228,0.84201758,0.690964176,0.927385229
0.173485057,0.214049903,0.27438753,0.433904377,0.821778689
0.982816721,0.094490904,0.105895645,0.894103833,0.34362529
0.738593272,0.423470984,0.343551191,0.192169774,0.907698897
0.021809601,0.406001002,0.072701623,0.964640184,0.023427393
0.406226618,0.421944527,0.413150342,0.337243905,0.515996389
0.829989793,0.168974332,0.246064043,0.067662474,0.851182924
0.812736737,0.667154845,0.118274705,0.484017732,0.052666038
0.215947395,0.145078319,0.484063281,0.79414799,0.373845815
0.497877968,0.554808367,0.370429652,0.081553316,0.793608698
0.607612542,0.424703584,0.208995066,0.249033837,0.808169709
0.199613478,0.065853429,0.77236195,0.757789625,0.597225697
0.044167285,0.1024231,0.959682778,0.892311813,0.621810775
0.861175219,0.853442735,0.742542086,0.704287769,0.435969078
0.706544823,0.062501379,0.482065481,0.598698867,0.845585046
0.967217599,0.13127149,0.294860203,0.191045015,0.590202032
0.031666757,0.965674812,0.177792841,0.419935921,0.895265056
TIME,HDRB-1,HDRB-2,HDRB-3,HDRB-4
0.306849588,0.177454423,0.538670939,0.602747137,0.081221293
0.729747557,0.11762043,0.409064884,0.051577964,0.666653287
0.492543468,0.097222882,0.448642979,0.130965724,0.48613413
0.0802024,0.726352481,0.457476151,0.647556514,0.033820374
0.617976299,0.934428994,0.197735831,0.765364856,0.350880707
0.07660401,0.285816636,0.276995238,0.047003343,0.770284864
0.620820688,0.700434525,0.896417099,0.652364756,0.93838793
0.364233925,0.200229902,0.648342989,0.919306736,0.897029239
0.606100716,0.203585366,0.167232701,0.523079381,0.767224301
0.616600448,0.130377791,0.554714839,0.468486555,0.582775753
0.254480861,0.933534632,0.054558237,0.948978985,0.731855548
0.620161044,0.583061202,0.457991555,0.441254272,0.657127968
0.415874646,0.408141761,0.843133575,0.40991199,0.540792744
0.254903429,0.655739954,0.977873649,0.210656057,0.072451639
0.473680525,0.298845701,0.144989283,0.998560665,0.223980961
0.30605008,0.837920854,0.450681322,0.887787908,0.793229776
0.584644405,0.423279153,0.444505314,0.686058204,0.041154856
from io import StringIO
import pandas as pd
data ="""
TIME,HDRA-1,HDRA-2,HDRA-3,HDRA-4
0.473934934,0.944026678,0.460177668,0.157028404,0.221362174
0.911384892,0.336694914,0.586014563,0.828339071,0.632790473
0.772652589,0.318146985,0.162987171,0.555896202,0.659099194
0.541382917,0.033706768,0.229596419,0.388057901,0.465507295
0.462815443,0.088206108,0.717132904,0.545779038,0.268174922
0.522861489,0.736462083,0.532785319,0.961993893,0.393424116
TIME,HDRB-1,HDRB-2,HDRB-3,HDRB-4
0.92264286,0.026312552,0.905839375,0.869477136,0.985560264
0.410573341,0.004825381,0.920616162,0.19473237,0.848603523
0.999293171,0.259955029,0.380094352,0.101050014,0.428047493
0.820216119,0.655118219,0.586754951,0.568492346,0.017038336
0.040384337,0.195101879,0.778631044,0.655215972,0.701596844
TIME,HDRB-1,HDRB-2,HDRB-3,HDRB-4
0.342418934,0.290979228,0.84201758,0.690964176,0.927385229
0.173485057,0.214049903,0.27438753,0.433904377,0.821778689
0.982816721,0.094490904,0.105895645,0.894103833,0.34362529
0.738593272,0.423470984,0.343551191,0.192169774,0.907698897
"""
# read everything as plain rows, with no header row
df = pd.read_csv(StringIO(data), header=None)
start_marker = 'TIME'
# every row whose first field is 'TIME' starts a new block; cumsum() gives each row a block number
grouper = (df.iloc[:, 0] == start_marker).cumsum()
groups = df.groupby(grouper)
# within each block, promote the first row (the repeated header) to column names
frames = [gr.T.set_index(gr.index[0]).T for _, gr in groups]
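A small follow-up, in case it helps (an addition, under the assumption that the real data is all numeric): because the file was read with header=None, every column in each block comes back as strings, so the frames can be converted afterwards. The same approach works when passing a file path instead of a StringIO.
# optional: convert every block to numeric dtypes and reset the row labels
frames = [fr.apply(pd.to_numeric).reset_index(drop=True) for fr in frames]
print(frames[0].dtypes)
print(frames[0].head())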

How to calculate tf-idf when working on .txt files in python 3.7?

I have books in PDF and I want to do NLP tasks such as preprocessing, tf-idf calculation, word2vec, etc. on those books. So I converted them into .txt files and was trying to get tf-idf scores. Previously I performed tf-idf on a CSV file, so I made some changes to that code and tried to use it on a .txt file, but I have been unsuccessful.
Below is my code:
import pandas as pd
import numpy as np
from itertools import islice
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
data = open('jungle book.txt', 'r+')
# print(data.read())
cvec = CountVectorizer(stop_words='english', min_df=1, max_df=.5, ngram_range=(1,2))
cvec.fit(data)
list(islice(cvec.vocabulary_.items(), 20))
len(cvec.vocabulary_)
cvec_count = cvec.transform(data)
print('Sparse Matrix Shape : ', cvec_count.shape)
print('Non Zero Count : ', cvec_count.nnz)
print('sparsity: %.2f%%' % (100 * cvec_count.nnz / (cvec_count.shape[0] * cvec_count.shape[1])))
occ = np.asarray(cvec_count.sum(axis=0)).ravel().tolist()
count_df = pd.DataFrame({'term': cvec.get_feature_names(), 'occurrences' : occ})
term_freq = count_df.sort_values(by='occurrences', ascending=False).head(20)
print(term_freq)
transformer = TfidfTransformer()
transformed_weights = transformer.fit_transform(cvec_count)
weights = np.asarray(transformed_weights.mean(axis=0)).ravel().tolist()
weight_df = pd.DataFrame({'term' : cvec.get_feature_names(), 'weight' : weights})
tf_idf = weight_df.sort_values(by='weight', ascending=False).head(20)
print(tf_idf)
This code works up to the print('Non Zero Count : ', cvec_count.nnz) line, printing:
Sparse Matrix Shape : (0, 7132)
Non Zero Count : 0
Then it is giving error:
ZeroDivisionError: division by zero
Even if I run this code ignoring the ZeroDivisionError, the result is still wrong, as no frequencies are being counted.
I have no idea how to work with .txt files. What is the proper way to work with a .txt file for NLP tasks?
Thanks in advance!
You are getting the error because the data variable is empty or of the wrong type. Just opening the text file is not enough; you have to read its contents into a string variable and then do the preprocessing on that variable. Try replacing
data = open('jungle book.txt', 'r+')
# print(data.read())
with
with open('jungle book.txt', 'r') as file:
    data = file.read()
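A further note that may explain the (0, 7132) shape (an addition, not part of the original answer): CountVectorizer.fit and .transform expect an iterable of documents and each iterate over what they are given. Iterating an open file object yields its lines, so in the original code fit() builds a vocabulary from the lines, but transform() then receives the same, already exhausted, file object and sees zero documents. A single string would also need to be wrapped in a list, and with only one document the min_df/max_df settings lose their meaning, so one option is to read the lines into a list first and treat each line as a document; a minimal sketch:
with open('jungle book.txt', 'r') as file:
    # treat each non-empty line as one document so the corpus can be iterated repeatedly
    docs = [line.strip() for line in file if line.strip()]
cvec.fit(docs)
cvec_count = cvec.transform(docs)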

TypeError: ufunc add cannot use operands with types dtype('<M8[ns]') and dtype('<M8[ns]')

I am trying to fit an ARIMA model to some data. For this I used autocorrelation_plot() with my time series, but it generates the error in the title.
I have an attribute table composed, among others, of Date and Time fields.
I extracted them (after transforming the attribute table into a numpy table), put them into a datetime variable and appended them all to a list:
O,A = [],[]
dt = datetime.strptime(dt1, "%Y/%m/%d %H:%M")
A.append(dt)
I then tried to create a time series and printed it to be sure of the results:
data2 = pd.Series(A, O)
print data2
The results were satisfying, until I decided to auto-correlate.
Auto-correlation command:
autocorrelation_plot(data2)
After this command, it returns:
TypeError: ufunc add cannot use operands with types dtype('M8[ns]') and dtype('M8[ns]')
I guess it's due to the conversion of the datetime.strptime values to numpy?
I tried to follow some suggestions from previous questions (index.to_pydatetime(), the dtype 'M8[ns]' error, ...), in vain.
Minimal reproducible example:
import arcpy
from pandas import datetime
from pandas import DataFrame
import pandas as pd
from matplotlib import pyplot as plt
from pandas.tools.plotting import autocorrelation_plot

arr = arcpy.da.TableToNumPyArray(inTable, ("PROVINCE", "ZONE_CODE", "MEAN", "Datetime", "Time"))
arr_length = len(arr)
j = 1
O, A = [], []
while j <= 55:  # I have 55 provinces
    i = 0
    while i < arr_length:
        if arr[i][1] == j:
            O.append(arr[i][2])
            c = str(arr[i][3])
            d = str(c[0:4] + "/" + c[5:7] + "/" + c[8:10])
            t = str(arr[i][4])
            if t == "10":
                dt1 = str(d + " 10:00")
            else:
                dt1 = str(d + " 14:00")
            dt = datetime.strptime(dt1, "%Y/%m/%d %H:%M")
            A.append(dt)
        i = i + 1
    data2 = pd.Series(A, O)
    print data2
    autocorrelation_plot(data2)
    del A[:]
    del O[:]
    j += 1
Screenshot of the results (image omitted).
I used this to solve my issue:
import matplotlib.dates as mpl_dates
df.reset_index(inplace=True)
df['Date']=df['Date'].apply(mpl_dates.date2num)
df = df.astype(float)
I found a solution; it may look barbaric, but it works!
I just recreated the pd.Series() from the pd.Series I had:
data2 = pd.Series(O, A)
autocorrelation_plot(pd.Series(data2))
plt.show()
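For context (a note added here, not part of the original answers): the original Series was built as pd.Series(A, O), which puts the datetimes in as the values and the numeric measurements in as the index, so autocorrelation_plot ends up trying to add datetime64 values and fails. Swapping the arguments makes the measurements the values and the datetimes the index. Using explicit keywords may make this clearer; a sketch:
data2 = pd.Series(data=O, index=pd.DatetimeIndex(A))  # numeric values, datetime index
autocorrelation_plot(data2)
plt.show()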

Set Multiple Restrictions for Rows Called to Print in Pandas

import pandas as pd
import numpy as np
#load data
#data file and py file must be in same file path
df = pd.read_csv('cbp15st.txt', delimiter = ',', encoding = 'utf-8-sig')
#define load data DataFrame columns
state = df['FIPSTATE']
industry = df['NAICS']
legal_form_of_organization = df['LFO']
suppression_flag = df['EMPFLAG']
total_establishment = df['EST']
establishment_1_4 = df['N1_4']
establishment_5_9 = df['N5_9']
establishment_10_19 = df['N10_19']
establishment_20_49 = df['N20_49']
establishment_50_99 = df['N50_99']
establishment_100_249 = df['N100_249']
establishment_250_499 = df['N250_499']
establishment_500_999 = df['N500_999']
establishment_1000_more = df['N1000']
#use df.loc to parse dataset for particular value types
print(df.loc[df['EMPFLAG']=='A'], df.loc[df['FIPSTATE']==1],
df.loc[df['NAICS']=='------'])
I am currently using df.loc to locate specific values in the df columns, but this prints the rows that match each condition separately rather than only the rows that satisfy all of them at once (an "or" versus "and" situation).
I am trying to find a way to place multiple restrictions on this so that it only returns rows that meet criteria x, y and z.
Current readout from the above (screenshot omitted).
You can use the & operator when specifying multiple filtering criteria, something like:
df1 = df.loc[(df['EMPFLAG']=='A') & (df['FIPSTATE']==1) & (df['NAICS']=='------')]
print(df1)
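An equivalent form (a small addition using pandas' query method, not part of the original answer) can read more cleanly when several conditions are combined:
# same filter expressed with DataFrame.query
df1 = df.query("EMPFLAG == 'A' and FIPSTATE == 1 and NAICS == '------'")
print(df1)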

How to read every file in folder to dataframe named after filename and overlay column names?

I am working on a project where I am downloading public data from (http://pdata.hcad.org/download/), more particularly the zip files "real_acct_ownership" and "real_building_land".
Each of these zip files contains data on homes built in the Houston area, such as addresses, fixtures, square footage, etc.
My goal is to organize the data so that all the files in the zip folders become data frames indexable by the column "account".
I am running into the issue of how to create a function or for loop that reads each file into a data frame named after the file, and how to apply column names, since the data in the zip folders does not contain them. The column names can be found in the "access.zip" folder at the top left-hand corner of the website.
In my code so far I am reading each file from the above two folders and specifying each column name by hand. I would like this to be an iterative process, as I will have to do the same for other counties, so I want a way to loop over the files in a folder.
My code so far, with no loops:
import pandas as pd
fixtures = pd.read_csv('/Users/Desktop/Real_building_land/fixtures.txt',header = None,
encoding= 'cp037', error_bad_lines=False, sep='\t')
real_acct =pd.read_csv('/Users/Desktop/Real_acct_owner/real_acct.txt', header = None,
encoding = 'cp037', error_bad_lines=False, sep='\t')
exterior = pd.read_csv('/Users/Desktop/Real_building_land/exterior.txt', header = None,
encoding = 'cp037', error_bad_lines=False, sep='\t')
fixtures.columns = ('ACCOUNT','BUILDING_NUMBER','FIXTURE_TYPE','FIXTURE_DESCRIPTION','UNITS')
real_acct.columns = ("ACCOUNT","TAX_YEAR","MAILTO","MAIL_ADDR_1","MAIL_ADDR_2","MAIL_CITY","MAIL_STATE",
"MAIL_ZIP","MAIL_COUNTRY","UNDELIVERABLE","STR_PFX" ,"STR_NUM", "STR_NUM_SFX","STR_NAME",
"STR_SFX","STR_SFX_DIR","STR_UNIT","SITE_ADDR_1","SITE_ADDR_2","SITE_ADDR_3","STATE_CLASS",
"SCHOOL_DIST","MAP_FACET","KEY_MAP","NEIGHBORHOOD_CODE","NEIGHBORHOOD_GROUP","MARKET_AREA_1",
"MARKET_AREA_1_DSCR","MARKET_AREA_2","MARKET_AREA_2_DSCR","ECON_AREA","ECON_BLD_CLASS",
"CENTER_CODE","YR_IMPR","YR_ANNEXED","SPLT_DT","DSC_CD","NXT_BUILDING","TOTAL_BUILDING_AREA",
"TOTAL_LAND_AREA","ACREAGE","CAP_ACCOUNT","SHARED_CAD_CODE","LAND_VALUE","IMPROVEMENT_VALUE",
"EXTRA_FEATURES_VALUE" ,"AG_VALUE","ASSESSED_VALUE","TOTAL_APPRAISED_VALUE","TOTAL_MARKET_VALUE",
"PRIOR_LND_VALUE","PRIOR_IMPR_VALUE","PRIOR_X_FEATURES_VALUE","PRIOR_AG_VALUE",
"PRIOR_TOTAL_APPRAISED_VALUE","PRIOR_TOTAL_MARKET_VALUE","NEW_CONSTRUCTION_VALUE",
"TOTAL_RCN_VALUE","VALUE_STATUS","NOTICED","NOTICE_DATE","PROTESTED","CERTIFIED_DATE",
"LAST_INSPECTED_DATE","LAST_INSPECTED_BY","NEW_OWNER_DATE","LEGAL_DSCR_1","LEGAL_DSCR_2",
"LEGAL_DSCR_3","LEGAL_DSCR_4","JURS")
exterior.columns = ("ACCOUNT","BUILDING_NUMBER","EXTERIOR_TYPE","EXTERIOR_DESCRIPTION","AREA")
df = fixtures.merge(real_acct,on='ACCOUNT').merge(exterior,on='ACCOUNT')
#df = df.loc[df['ACCOUNT'] == 10020000015]
print(df.shape)
Code from a few trials with loops; nothing worked:
import pandas as pd
import glob
import os
dfs = {os.path.basename(f): pd.read_csv(f, sep='\t', header=None,encoding='cp037',
error_bad_lines=False) for f in glob.glob('/Users/Desktop/Real_building_land/*.txt')}
print(dfs)
path = r'path' # use your path
allFiles = glob.glob(path + "/*.csv")
frame = pd.DataFrame()
list_ = []
for file_ in allFiles:
    df = pd.read_csv(file_, index_col=None, header=0)
    list_.append(df)
frame = pd.concat(list_)
Thank you in advance.
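A sketch of one way the loop could be structured (an illustration rather than a tested answer; the folder path, read_csv options and column tuples are reused from the code above, and the columns_by_file mapping is a hypothetical placeholder to be filled in from access.zip): keep a dict from file name to column names, read every .txt file in the folder, assign its columns, and store the frames in a dict keyed by file name.
import glob
import os
import pandas as pd

# hypothetical mapping from file name to the column names listed in access.zip
columns_by_file = {
    'fixtures.txt': ('ACCOUNT', 'BUILDING_NUMBER', 'FIXTURE_TYPE', 'FIXTURE_DESCRIPTION', 'UNITS'),
    'exterior.txt': ('ACCOUNT', 'BUILDING_NUMBER', 'EXTERIOR_TYPE', 'EXTERIOR_DESCRIPTION', 'AREA'),
    # add the remaining files and their column lists here
}

dfs = {}
for path in glob.glob('/Users/Desktop/Real_building_land/*.txt'):
    name = os.path.basename(path)
    df = pd.read_csv(path, sep='\t', header=None, encoding='cp037',
                     error_bad_lines=False)
    if name in columns_by_file:
        df.columns = columns_by_file[name]
    dfs[name] = df

# each frame can then be indexed by its ACCOUNT column, e.g.
# fixtures = dfs['fixtures.txt'].set_index('ACCOUNT')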