CAN ANYONE HELP
# Sample Binance kline websocket message; the fields of interest live in msg['k'].
msg = {'e': 'kline',
'E': 1672157513375,
's': 'BTCUSDT',
'k': {
't': 1672157460000,  # kline open time in ms -> convert to datetime, rename "Time", use as index
'T': 1672157519999,
's': 'BTCUSDT',
'i': '1m',
'f': 2388965371,
'L': 2388969270,
'o': '16787.32000000',  # rename as "Open"
'c': '16783.23000000',  # rename as "Close"
'h': '16789.41000000',  # rename as "High"
'l': '16782.69000000',  # rename as "Low"
'v': '149.27507000',  # rename as "Volume"
'n': 3900,
'x': False,
'q': '2505669.98288240',
'V': '59.70465000',
'Q': '1002207.92308370',
'B': '0'
}
}
Required columns, taken from msg['k']:
Time   = k['t']  (datetime)
Open   = k['o']  (float)
High   = k['h']  (float)
Low    = k['l']  (float)
Close  = k['c']  (float)
Volume = k['v']  (float)
The millisecond timestamp k['t'] must be converted to a datetime
and used as the DataFrame index.
language python
What I am trying:
def getdata(msg):
    """Convert a Binance kline websocket message into a one-row OHLCV DataFrame.

    Parameters
    ----------
    msg : dict
        Kline event; the nested dict msg['k'] must carry keys
        't' (open time, ms epoch), 'o', 'h', 'l', 'c', 'v'.

    Returns
    -------
    pandas.DataFrame
        Columns Open/High/Low/Close/Volume as floats, indexed by the
        kline open time converted from milliseconds to a datetime.
    """
    k = msg['k']
    # Build a single row in the desired column order; the original attempt
    # passed the whole nested dict to pd.DataFrame and then indexed it
    # incoherently with .loc.
    frame = pd.DataFrame(
        [[k['t'], k['o'], k['h'], k['l'], k['c'], k['v']]],
        columns=["Time", "Open", "High", "Low", "Close", "Volume"],
    )
    frame.set_index("Time", inplace=True)
    # Millisecond epoch -> datetime index.
    frame.index = pd.to_datetime(frame.index, unit='ms')
    # Binance sends prices/volume as strings; cast the remaining columns.
    frame = frame.astype(float)
    return frame
getdata(msg)
REQUIRE OUTPUT:
Time Open High Low Close Volume
2022-12-27 16:11:00 16787.7 16789.4 16782.6 16783.2 149
<3
Using json_normalize():
# Build the OHLCV frame straight from the nested "k" payload.
cols = {"t": "Time", "o": "Open", "h": "High", "l": "Low", "c": "Close", "v": "Volume"}
df = pd.json_normalize(data=msg["k"])[list(cols)].rename(columns=cols)
# Millisecond epoch -> UTC -> naive timestamp, truncated to whole seconds.
stamped = pd.to_datetime(df["Time"], unit="ms")
df["Time"] = stamped.dt.tz_localize("UTC").dt.tz_localize(None).dt.floor("S")
print(df)
Output:
Time Open High Low Close Volume
0 2022-12-27 16:11:00 16787.32000000 16789.41000000 16782.69000000 16783.23000000 149.27507000
Related
I have created a pivot table where the column headers have several levels. This is a simplified version:
# Simplified pivot table: two column-header levels (condition, statistic)
# and one row per person.
index = ['Person 1', 'Person 2', 'Person 3']
columns = [
    ["condition 1", "condition 1", "condition 1", "condition 2", "condition 2", "condition 2"],
    ["Mean", "SD", "n", "Mean", "SD", "n"],
]
data = [
    [100, 10, 3, 200, 12, 5],
    [500, 20, 4, 750, 6, 6],
    [1000, 30, 5, None, None, None],
]
# Bug fix: the index list was built but never passed to the constructor,
# so the rows fell back to a RangeIndex instead of the person labels.
df = pd.DataFrame(data, index=index, columns=columns)
df
Now I would like to highlight the adjacent cells next to SD if SD > 10. This is how it should look like:
I found this answer but couldn't make it work for multiindices.
Thanks for any help.
Use Styler.apply with custom function - for select column use DataFrame.xs and for repeat boolean use DataFrame.reindex:
def hightlight(x):
    """Return a same-shaped frame of CSS strings that highlights every cell
    of a condition group whose SD value exceeds 10 (the mask is broadcast
    across all level-1 columns of that condition, including SD itself)."""
    c1 = 'background-color: red'
    # One boolean per (row, condition): True where that condition's SD > 10.
    mask = x.xs('SD', axis=1, level=1).gt(10)
    #DataFrame with same index and columns names as original filled empty strings
    df1 = pd.DataFrame('', index=x.index, columns=x.columns)
    #modify values of df1 column by boolean mask
    # reindex repeats the per-condition boolean for every column sharing that
    # level-0 label, so the whole group is masked.
    return df1.mask(mask.reindex(x.columns, level=0, axis=1), c1)

# axis=None hands the entire DataFrame to the styling function at once.
df.style.apply(hightlight, axis=None)
I have a websocket connection to Binance in my script. The websocket runs forever as usual. I get each pair's output as a separate message from my multi-stream connection.
for example here is the sample output:
{'stream': 'reefusdt#kline_1m', 'data': {'e': 'kline', 'E': 1651837066242, 's': 'REEFUSDT', 'k': {'t': 1651837020000, 'T': 1651837079999, 's': 'REEFUSDT', 'i': '1m', 'f': 95484416, 'L': 95484538, 'o': '0.006620', 'c': '0.006631', 'h': '0.006631', 'l': '0.006619', 'v': '1832391', 'n': 123, 'x': False, 'q': '12138.640083', 'V': '930395', 'Q': '6164.398584', 'B': '0'}}}
{'stream': 'ethusdt#kline_1m', 'data': {'e': 'kline', 'E': 1651837066253, 's': 'ETHUSDT', 'k': {'t': 1651837020000, 'T': 1651837079999, 's': 'ETHUSDT', 'i': '1m', 'f': 1613620941, 'L': 1613622573, 'o': '2671.86', 'c': '2675.79', 'h': '2675.80', 'l': '2671.81', 'v': '1018.530', 'n': 1633, 'x': False, 'q': '2723078.35891', 'V': '702.710', 'Q': '1878876.68612', 'B': '0'}}}
{'stream': 'ancusdt#kline_1m', 'data': {'e': 'kline', 'E': 1651837066257, 's': 'ANCUSDT', 'k': {'t': 1651837020000, 'T': 1651837079999, 's': 'ANCUSDT', 'i': '1m', 'f': 10991664, 'L': 10992230, 'o': '2.0750', 'c': '2.0810', 'h': '2.0820', 'l': '2.0740', 'v': '134474.7', 'n': 567, 'x': False, 'q': '279289.07500', 'V': '94837.8', 'Q': '197006.89950', 'B': '0'}}}
is there a way to edit this output like listed below. Main struggle is each one of the outputs are different dataframes. I want to merge them into one single dataframe. Output comes as a single nested dict which has two columns: "stream" and "data". "Data" has 4 columns in it and the last column "k" is another dict of 17 columns. I somehow managed to get only "k" in it:
# Decode the raw websocket text, then drill down to the nested kline dict.
json_message = json.loads(message)
result = json_message["data"]["k"]
and sample output is:
{'t': 1651837560000, 'T': 1651837619999, 's': 'CTSIUSDT', 'i': '1m', 'f': 27238014, 'L': 27238039, 'o': '0.2612', 'c': '0.2606', 'h': '0.2613', 'l': '0.2605', 'v': '17057', 'n': 26, 'x': False, 'q': '4449.1499', 'V': '3185', 'Q': '831.2502', 'B': '0'}
{'t': 1651837560000, 'T': 1651837619999, 's': 'ETCUSDT', 'i': '1m', 'f': 421543741, 'L': 421543977, 'o': '27.420', 'c': '27.398', 'h': '27.430', 'l': '27.397', 'v': '2988.24', 'n': 237, 'x': False, 'q': '81936.97951', 'V': '1848.40', 'Q': '50688.14941', 'B': '0'}
{'t': 1651837560000, 'T': 1651837619999, 's': 'ETHUSDT', 'i': '1m', 'f': 1613645553, 'L': 1613647188, 'o': '2671.38', 'c': '2669.95', 'h': '2672.38', 'l': '2669.70', 'v': '777.746', 'n': 1636, 'x': False, 'q': '2077574.75281', 'V': '413.365', 'Q': '1104234.98707', 'B': '0'}
I want to merge these outputs into a single dataframe of 6 columns and almost 144 rows, similar to the screenshot provided below. The only difference is that my code creates a separate dataframe for each output.
Create a list of your messages. Your messages list should be like below:
# Collect the per-stream kline dicts and stack them into one DataFrame.
message_list = [message1, message2, message3]
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# build one frame per message and concatenate them in a single call instead
# of growing the frame row by row (which was also quadratic).
frames = [pd.DataFrame(m, index=[i]) for i, m in enumerate(message_list)]
df = pd.concat(frames, ignore_index=True)
print(df)
t T s i f L o c h l v n x q V Q B
0 1651837560000 1651837619999 CTSIUSDT 1m 27238014 27238039 0.2612 0.2606 0.2613 0.2605 17057 26 False 4449.1499 3185 831.2502 0
1 1651837560000 1651837619999 ETCUSDT 1m 421543741 421543977 27.420 27.398 27.430 27.397 2988.24 237 False 81936.97951 1848.40 50688.14941 0
2 1651837560000 1651837619999 ETHUSDT 1m 1613645553 1613647188 2671.38 2669.95 2672.38 2669.70 777.746 1636 False 2077574.75281 413.365 1104234.98707 0
You can manipulate the dataframe later as needed.
Basically, I want to use iterrows method to loop through my group-by dataframe, but I can't figure out how the columns work. In the example below, it does not create a column Called "Group1" and "Group2" like one might expect. One of the columns is a dtype itself?
import pandas as pd

# Build the sample frame in one shot: DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0, and appending row-by-row is quadratic anyway.
df = pd.DataFrame([
    {"Group1": "Apple", "Group2": "Red Delicious", "Amount": 15},
    {"Group1": "Apple", "Group2": "McIntosh", "Amount": 20},
    {"Group1": "Apple", "Group2": "McIntosh", "Amount": 30},
    {"Group1": "Apple", "Group2": "Fuju", "Amount": 7},
    {"Group1": "Orange", "Group2": "Navel", "Amount": 9},
    {"Group1": "Orange", "Group2": "Navel", "Amount": 5},
    {"Group1": "Orange", "Group2": "Mandarin", "Amount": 12},
])
print(df.dtypes)
print(df.to_string())

# Select the column before aggregating: the original .sum(['Amount']) passed
# the list into the positional numeric_only argument, which breaks on modern
# pandas. Keeping [['Amount']] preserves the DataFrame (not Series) result.
df_sum = df.groupby(['Group1', 'Group2'])[['Amount']].sum()
print("---- Sum Results----")
print(df_sum.dtypes)
print(df_sum.to_string())

for index, row in df_sum.iterrows():
    # Group1/Group2 are index levels here, not columns, so they are not row
    # attributes; they are available as the tuple `index` (or `row.name`).
    print(row.Amount)
The part of the output we are interested in is shown here. I noticed that "Group1" and "Group2" appear on a line below "Amount".
---- Sum Results----
Amount int64
dtype: object
Amount
Group1 Group2
Apple Fuju 7
McIntosh 50
Red Delicious 15
Orange Mandarin 12
Navel 14
Simply try:
df_sum = df.groupby(['Group1', 'Group2'])['Amount'].sum().reset_index()
OR
df_sum = df.groupby(['Group1', 'Group2'])['Amount'].agg('sum').reset_index()
It can even be simplified as follows, since we are summing based on Group1 & Group2 only.
df_sum = df.groupby(['Group1', 'Group2']).sum().reset_index()
Another way:
df_sum = df.groupby(['Group1', 'Group2']).agg({'Amount': 'sum'}).reset_index()
Try to reset_index
df_sum = df.groupby(['Group1', 'Group2']).sum(['Amount']).reset_index()
I have this dataframe :
# Six observations keyed by (A, B, C, D); pivot the C and D levels out into
# the columns and fill the combinations that never occurred with zeros.
df = (
    pandas.DataFrame(
        {
            'A': [2000, 2000, 2000, 2000, 2000, 2000],
            'B': ["A+", "B+", "A+", "B+", "A+", "B+"],
            'C': ["M", "M", "M", "F", "F", "F"],
            'D': [1, 5, 3, 4, 2, 6],
            'Value': [11, 12, 13, 14, 15, 16],
        }
    )
    .set_index(['A', 'B', 'C', 'D'])
    .unstack(['C', 'D'])
    .fillna(0)
)
And I'm wondering is there is a more elegant way to order the columns MultiIndex that the following code :
# rows ordering
df = df.sort_values(by = ['A', "B"], ascending = [True, True])
# col ordering
df = df.transpose().sort_values(by = ["C", "D"], ascending = [False, False]).transpose()
In particular, I feel that the last line, with its two transposes, is far more complex than it should be. I tried using sort_index but wasn't able to apply it in a MultiIndex context (for both rows and columns).
You can use sort index on both levels:
out = df.sort_index(level=[0,1],axis=1,ascending=[True, False])
I can use
axis=1
And therefore the last line become
df = df.sort_values(axis = 1, by = ["C", "D"], ascending = [True, False])
I'm failing to get scalar predictions out of a CLA model.
Here's a self-contained example. It uses config to create a model using the ModelFactory. Then it trains it with a simple data set ({input_field=X, output_field=X} where X is random between 0-1). Then it attempts to extract predictions with input of the form {input_field=X, output_field=None}.
#!/usr/bin/python
# Python 2 / NuPIC OPF example: train a CLA model on identity data
# (output_field == input_field) and then ask it to predict output_field.
import random
from nupic.frameworks.opf.modelfactory import ModelFactory

# Model configuration: nontemporal classification with the spatial pooler and
# temporal pooler disabled, and a 0-step classifier.
config = {
    'model': "CLA",
    'version': 1,
    'modelParams': {
        'inferenceType': 'NontemporalClassification',
        'sensorParams': {
            'verbosity' : 0,
            'encoders': {
                # Encoder feeding only the classifier (classifierOnly=True);
                # this is the field whose value we want predicted.
                '_classifierInput': {
                    'classifierOnly': True,
                    'clipInput': True,
                    'fieldname': u'output_field',
                    'maxval': 1.0,
                    'minval': 0.0,
                    'n': 100,
                    'name': '_classifierInput',
                    'type': 'ScalarEncoder',
                    'w': 21},
                u'input_field': {
                    'clipInput': True,
                    'fieldname': u'input_field',
                    'maxval': 1.0,
                    'minval': 0.0,
                    'n': 100,
                    'name': u'input_field',
                    'type': 'ScalarEncoder',
                    'w': 21},
            },
        },
        'spEnable': False,
        'tpEnable' : False,
        'clParams': {
            'regionName' : 'CLAClassifierRegion',
            'clVerbosity' : 0,
            'alpha': 0.001,
            'steps': '0',  # 0 steps ahead, i.e. classify the current record
        },
    },
}

model = ModelFactory.create(config)

ROWS = 100

def sample():
    # Uniform scalar inside the encoders' [0.0, 1.0] range.
    return random.uniform(0.0, 1.0)

# training data is {input_field: X, output_field: X}
def training():
    for r in range(ROWS):
        value = sample()
        yield {"input_field": value, "output_field": value}

# testing data is {input_field: X, output_field: None} (want output_field predicted)
def testing():
    for r in range(ROWS):
        value = sample()
        yield {"input_field": value, "output_field": None}

model.enableInference({"predictedField": "output_field"})
model.enableLearning()
for row in training():
    model.run(row)
#model.finishLearning() fails in clamodel.py
model.disableLearning()
for row in testing():
    result = model.run(row)
    print result.inferences # Shows None as value
The output I see is high confidence None rather than what I expect, which is something close to the input value (since the model was trained on input==output).
{'multiStepPredictions': {0: {None: 1.0}}, 'multiStepBestPredictions': {0: None}, 'anomalyScore': None}
{'multiStepPredictions': {0: {None: 0.99999999999999978}}, 'multiStepBestPredictions': {0: None}, 'anomalyScore': None}
{'multiStepPredictions': {0: {None: 1.0000000000000002}}, 'multiStepBestPredictions': {0: None}, 'anomalyScore': None}
{'multiStepPredictions': {0: {None: 1.0}}, 'multiStepBestPredictions': {0: None}, 'anomalyScore': None}
'NontemporalClassification' seems to be the right inferenceType, because it's a simple classification. But does that work with scalars?
Is there a different way of expressing that I want a prediction other than output_field=None?
I need output_field to be classifierOnly=True. Is there related configuration missing or wrong?
Thanks for your help.
Here's the working example. The key changes were
Use TemporalMultiStep as recommended by #matthew-taylor (adding required parameters)
Use "implementation": "py" in clParams. My values are in the range 0.0-1.0. The fast classifier always returns None for values in that range. The same code with "py" implementation returns valid values. Change the range to 10-100 and the fast algorithm returns valid values also. It was this change that finally produced non-None results.
Less significant than #2, in order to improve the results I repeat each training row in order to let it sink in, which makes sense for training.
To see the classifier bug, comment out line 19 "implementation": "py". The results will be None. Then change MIN_VAL to 10 and MAX_VAL to 100 and watch the results come back.
#!/usr/bin/python
# Python 2 / NuPIC: working version of the scalar-prediction example.
# Key changes vs. the question: TemporalMultiStep inference with SP/TP enabled,
# and the pure-Python classifier ("implementation": "py"), which handles values
# in the 0.0-1.0 range that the cpp classifier returns None for.
import random
from nupic.frameworks.opf.modelfactory import ModelFactory
from nupic.support import initLogging
from nupic.encoders import ScalarEncoder
import numpy

MIN_VAL = 0.0
MAX_VAL = 1.0

config = {
    'model': "CLA",
    'version': 1,
    'predictAheadTime': None,
    'modelParams': {
        'clParams': {
            "implementation": "py", # cpp version fails with small numbers
            'regionName' : 'CLAClassifierRegion',
            'clVerbosity' : 0,
            'alpha': 0.001,
            'steps': '1',  # predict one step ahead
        },
        'inferenceType': 'TemporalMultiStep',
        'sensorParams': {
            'encoders': {
                # Classifier-only encoder for the field being predicted.
                '_classifierInput': {
                    'classifierOnly': True,
                    'clipInput': True,
                    'fieldname': 'output_field',
                    'maxval': MAX_VAL,
                    'minval': MIN_VAL,
                    'n': 200,
                    'name': '_classifierInput',
                    'type': 'ScalarEncoder',
                    'w': 21},
                u'input_field': {
                    'clipInput': True,
                    'fieldname': 'input_field',
                    'maxval': MAX_VAL,
                    'minval': MIN_VAL,
                    'n': 100,
                    'name': 'input_field',
                    'type': 'ScalarEncoder',
                    'w': 21},
            },
            'sensorAutoReset' : None,
            'verbosity' : 0,
        },
        'spEnable': True,
        'spParams': {
            'columnCount': 2048,
            'globalInhibition': 1,
            'spatialImp': 'cpp',
        },
        'tpEnable' : True,
        'tpParams': { 'activationThreshold': 12,
            'cellsPerColumn': 32,
            'columnCount': 2048,
            'temporalImp': 'cpp',
        },
        'trainSPNetOnlyIfRequested': False,
    },
}
# end of config dictionary

model = ModelFactory.create(config)

TRAINING_ROWS = 100
TESTING_ROWS = 100

def sample(r = 0.0):
    # NOTE(review): the r argument is accepted but unused — presumably a leftover.
    return random.uniform(MIN_VAL, MAX_VAL)

def training():
    # Each training value is repeated 5x so it "sinks in"; the sequence is
    # reset (_reset=1) on the first copy of each value.
    for r in range(TRAINING_ROWS):
        value = sample(r / TRAINING_ROWS)
        for rd in range(5):
            yield {
                "input_field": value,
                "output_field": value,
                '_reset': 1 if (rd==0) else 0,
            }

def testing():
    # output_field=None signals "predict this field".
    for r in range(TESTING_ROWS):
        value = sample()
        yield {
            "input_field": value,
            "output_field": None,
        }

model.enableInference({"predictedField": "output_field"})

for row in training():
    model.run(row)

for row in testing():
    result = model.run(row)
    # steps='1' above means the 1-step-ahead bucket holds the prediction.
    prediction = result.inferences['multiStepBestPredictions'][1]
    if prediction==None:
        print "Input %f, Output None" % (row['input_field'])
    else:
        print "Input %f, Output %f (err %f)" % (row['input_field'], prediction, prediction - row['input_field'])
The inferenceType you want is TemporalMultistep.
See this example for a complete walkthrough.