Trie implementation in Python, object reference

I'm looking at the following implementation of a trie in Python:
tree = {}

def add_to_tree(root, value_string):
    for character in value_string:
        root = root.setdefault(character, {})

def main():
    tree = {}
    add_to_tree(tree, 'abc')
    print(tree)

if __name__ == "__main__":
    main()
What is not clear to me is:
why is it returning {'a': {'b': {'c': {}}}} instead of {'a': {}, 'b': {}, 'c': {}}?
I ran the code through a tool that visualizes its execution. After iterating through 'a' I get tree = {'a': {}}, root = {}; then after 'b' I get tree = {'a': {'b': {}}}, root = {}. What's not clear is which variable holds the reference to {'b': {}} that gets assigned into {'a': {}} to change it to {'a': {'b': {}}}?

You are reassigning root to the newly created dict for every character. Change this line:
root = root.setdefault(character, {})
to:
root.setdefault(character, {})
This gives the desired output (note that dicts are unordered):
{'a': {}, 'c': {}, 'b': {}}
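As for which variable holds the reference in the original version: setdefault returns the inner dict it just inserted, so root walks one level down per character while tree keeps pointing at the outermost dict. A minimal sketch of the first two iterations (the variable inner is only added here for illustration):
tree = {}
root = tree                       # root and tree refer to the same dict
inner = root.setdefault('a', {})  # inserts 'a': {} into that dict and returns the inner {}
root = inner                      # root now refers to tree['a']; tree itself is unchanged
root.setdefault('b', {})          # mutates the dict stored under tree['a']
print(tree)                       # {'a': {'b': {}}}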


Pipeline generation - passing in simple data structures like lists/arrays

For a code repository project in Palantir Foundry, I am struggling with re-using some of my transformation logic.
It seems almost trivial, but: is there a way to send an Input to a Transform that is not a dataset/dataframe reference?
In my case I want to pass in strings or lists/arrays.
This is my code:
from pyspark.sql import functions as F
from transforms.api import Transform, Input, Output

def my_computation(result, customFilter, scope, my_categories, my_mappings):
    scope_df = scope.dataframe()
    my_categories_df = my_categories.dataframe()
    my_mappings_df = my_mappings.dataframe()

    filtered_cat_df = (
        my_categories_df
        .filter(F.col('CAT_NAME').isin(customFilter))
    )
    # ... more logic

def generateTransforms(config):
    transforms = []
    for key, value in config.items():
        o = {}
        for outKey, outValue in value['outputs'].items():
            o[outKey] = Output(outValue)
        i = {}
        for inpKey, inpValue in value['inputs'].items():
            i[inpKey] = Input(inpValue)
        i['customFilter'] = Input(value['my_custom_filter'])
        transforms.append(Transform(my_computation, inputs=i, outputs=o))
    return transforms

config = {
    "transform_one": {
        "my_custom_filter": {
            "foo",
            "bar"
        },
        "inputs": {
            "scope": "/my-project/input/scope",
            "my_categories": "/my-project/input/my_categories",
            "my_mappings": "/my-project/input/my_mappings"
        },
        "outputs": {
            "result": "/my-project/output/result"
        }
    }
}

TRANSFORMS = generateTransforms(config)
The concrete question is: how can I pass the values from my_custom_filter into customFilter in the transformation function my_computation?
If I execute it like above, I get the error "TypeError: unhashable type: 'set'".

This looks like a Python issue; can you point out which line is causing the error?
Reading through your code, I would guess it's this line:
i['customFilter'] = Input(value['my_custom_filter'])
Your Python logic is wrong: if we unpack your code, you're effectively making this call:
i['customFilter'] = Input({"foo", "bar"})
{"foo", "bar"} is a Python set literal, not a dataset path, which is what triggers the unhashable type: 'set' error.
Edit, to answer the comment on how to create a Python transform that locks a variable in a closure:
from pyspark.sql import functions as F
from transforms.api import transform

def create_transform(inputs, outputs, my_other_var):
    @transform(**inputs, **outputs)
    def compute(input_foo, input_bar, output_foobar, ctx):
        df = input_foo.dataframe()
        df = df.withColumn("mycol", F.lit(my_other_var))
        output_foobar.write_dataframe(df)
    return compute
and now you can call it like this:
transforms.append(create_transform(inputs, outputs, "foobar"))
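For context, a rough sketch of how the inputs and outputs dictionaries passed to create_transform could be built; the dataset paths and key names here are placeholders, not from the original question, and the keys must match the parameter names of compute:
inputs = {
    'input_foo': Input('/my-project/input/foo'),      # placeholder paths
    'input_bar': Input('/my-project/input/bar'),
}
outputs = {
    'output_foobar': Output('/my-project/output/foobar'),
}
transforms.append(create_transform(inputs, outputs, "foobar"))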

Vertex AI Model Batch prediction, issue with referencing existing model and input file on Cloud Storage

I'm struggling to correctly set up a Vertex AI pipeline which does the following:
1. read data from an API, store it to GCS, and use it as input for batch prediction.
2. get an existing model (Video classification on Vertex AI)
3. create a Batch prediction job with the input from point 1.
As will be seen, I don't have much experience with Vertex Pipelines/Kubeflow, so I'm asking for help/advice; hopefully it's just some beginner mistake.
This is the gist of the code I'm using as the pipeline:
from google_cloud_pipeline_components import aiplatform as gcc_aip
from kfp.v2 import dsl
from kfp.v2.dsl import component
from kfp.v2.dsl import (
    Output,
    Artifact,
    Model,
)

PROJECT_ID = 'my-gcp-project'
BUCKET_NAME = "mybucket"
PIPELINE_ROOT = "{}/pipeline_root".format(BUCKET_NAME)

@component
def get_input_data() -> str:
    # getting data from API, save to Cloud Storage
    # return GCS URI
    gcs_batch_input_path = 'gs://somebucket/file'
    return gcs_batch_input_path

@component(
    base_image="python:3.9",
    packages_to_install=['google-cloud-aiplatform==1.8.0']
)
def load_ml_model(project_id: str, model: Output[Artifact]):
    """Load existing Vertex model"""
    import google.cloud.aiplatform as aip

    model_id = '1234'
    model = aip.Model(model_name=model_id, project=project_id, location='us-central1')

@dsl.pipeline(
    name="batch-pipeline", pipeline_root=PIPELINE_ROOT,
)
def pipeline(gcp_project: str):
    input_data = get_input_data()
    ml_model = load_ml_model(gcp_project)

    gcc_aip.ModelBatchPredictOp(
        project=PROJECT_ID,
        job_display_name=f'test-prediction',
        model=ml_model.output,
        gcs_source_uris=[input_data.output],  # this doesn't work
        # gcs_source_uris=['gs://mybucket/output/'],  # hardcoded gs uri works
        gcs_destination_output_uri_prefix=f'gs://{PIPELINE_ROOT}/prediction_output/'
    )

if __name__ == '__main__':
    from kfp.v2 import compiler
    import google.cloud.aiplatform as aip

    pipeline_export_filepath = 'test-pipeline.json'
    compiler.Compiler().compile(pipeline_func=pipeline,
                                package_path=pipeline_export_filepath)
    # pipeline_params = {
    #     'gcp_project': PROJECT_ID,
    # }
    # job = aip.PipelineJob(
    #     display_name='test-pipeline',
    #     template_path=pipeline_export_filepath,
    #     pipeline_root=f'gs://{PIPELINE_ROOT}',
    #     project=PROJECT_ID,
    #     parameter_values=pipeline_params,
    # )
    # job.run()
When running the pipeline, it throws this exception at the Batch prediction step:
details = "List of found errors: 1.Field: batch_prediction_job.model; Message: Invalid Model resource name.
so I'm not sure what could be wrong. I tried to load the model in a notebook (outside of the component) and it returns correctly.
The second issue I'm having is referencing the GCS URI output from a component as the batch job input:
input_data = get_input_data2()
gcc_aip.ModelBatchPredictOp(
    project=PROJECT_ID,
    job_display_name=f'test-prediction',
    model=ml_model.output,
    gcs_source_uris=[input_data.output],  # this doesn't work
    # gcs_source_uris=['gs://mybucket/output/'],  # hardcoded gs uri works
    gcs_destination_output_uri_prefix=f'gs://{PIPELINE_ROOT}/prediction_output/'
)
During compilation, I get the following exception: TypeError: Object of type PipelineParam is not JSON serializable, though I think this could be an issue with the ModelBatchPredictOp component.
Again, any help/advice is appreciated; I've been dealing with this since yesterday, so maybe I missed something obvious.
libraries I'm using:
google-cloud-aiplatform==1.8.0
google-cloud-pipeline-components==0.2.0
kfp==1.8.10
kfp-pipeline-spec==0.1.13
kfp-server-api==1.7.1
UPDATE
After comments, some research and tuning, this works for referencing the model:
@component
def load_ml_model(project_id: str, model: Output[Artifact]):
    region = 'us-central1'
    model_id = '1234'
    model_uid = f'projects/{project_id}/locations/{region}/models/{model_id}'
    model.uri = model_uid
    model.metadata['resourceName'] = model_uid
and then I can use it as intended:
batch_predict_op = gcc_aip.ModelBatchPredictOp(
    project=gcp_project,
    job_display_name=f'batch-prediction-test',
    model=ml_model.outputs['model'],
    gcs_source_uris=[input_batch_gcs_path],
    gcs_destination_output_uri_prefix=f'gs://{BUCKET_NAME}/prediction_output/test'
)
UPDATE 2
Regarding the GCS path, a workaround is to define the path outside of the component and pass it in as an input parameter, for example (abbreviated):
@dsl.pipeline(
    name="my-pipeline",
    pipeline_root=PIPELINE_ROOT,
)
def pipeline(
    gcp_project: str,
    region: str,
    bucket: str
):
    ts = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    gcs_prediction_input_path = f'gs://{BUCKET_NAME}/prediction_input/video_batch_prediction_input_{ts}.jsonl'

    batch_input_data_op = get_input_data(gcs_prediction_input_path)  # this loads input data to the GCS path

    batch_predict_op = gcc_aip.ModelBatchPredictOp(
        project=gcp_project,
        model=training_job_run_op.outputs["model"],
        job_display_name='batch-prediction',
        # gcs_source_uris=[batch_input_data_op.output],
        gcs_source_uris=[gcs_prediction_input_path],
        gcs_destination_output_uri_prefix=f'gs://{BUCKET_NAME}/prediction_output/',
    ).after(batch_input_data_op)  # 'after' is needed so this runs once the input data is prepared, since get_input_data doesn't return anything
Still not sure why it doesn't work/compile when I return the GCS path from the get_input_data component.
I'm glad you solved most of your main issues and found a workaround for the model declaration.
Regarding your input.output observation on gcs_source_uris, the reason behind it is the way the function/class returns the value. If you dig into the classes/methods of google_cloud_pipeline_components, you will find that they implement a structure that allows you to use .outputs on the return value of the function called.
If you go to the implementation of one of the pipeline components, you will find that it returns an output array from the convert_method_to_component function. So, in order to have the same behaviour in your custom class/function, your function should return a value that can be accessed as an attribute. Below is a basic implementation of it.
class CustomClass():
    def __init__(self):
        self.return_val = {'path': 'custompath', 'desc': 'a desc'}

    @property
    def output(self):
        return self.return_val

hello = CustomClass()
print(hello.output['path'])
If you want to dig deeper into this, you can go to the following pages:
convert_method_to_component, which is the implementation of convert_method_to_component
Properties, the basics of properties in Python.

How to open online reference from IPython?

Is there a way to have IPython open a browser pointed at the appropriate online reference?
Especially for numpy, scipy, and matplotlib?
For example, the doc for numpy.linalg.cholesky is pretty hard to read in a terminal.
I don't think there is a direct way to make IPython or any shell open documentation online, because the primary job of a shell is to let you interact with the thing it is a shell to.
We could, however, write a script that opens a new browser tab with the documentation. Like so:
import webbrowser

docsList = {
    "numpy": lambda x: "https://docs.scipy.org/doc/numpy/reference/generated/" + x + ".html",
    "scipy": lambda x: "https://docs.scipy.org/doc/scipy/reference/generated/" + x + ".html",
    "matplotlib": lambda x: "https://matplotlib.org/api/" + x.split('.')[1] + "_api.html",
    "default": lambda x: "https://www.google.com/search?q=documentation+" + x
}

def online(method_name):
    """
    Opens up the documentation for method_name on the default browser.
    If the package doesn't match any entry in the dictionary, falls back to
    Google.

    Usage
    -------
    >>> lookUp.online("numpy.linalg.cholesky")
    >>> lookUp.online("matplotlib.contour")
    """
    try:
        url = make_url(method_name)
    except AttributeError:
        print("Enter the method name as a string and try again")
        return
    webbrowser.open(url, new=2)
    return

def make_url(method_name):
    package_name = method_name.split('.')[0]
    try:
        return docsList[package_name](method_name)
    except KeyError:
        return docsList["default"](method_name)
You could save the above as "lookUp.py" at a location that Python can find it in, and then import it whenever you need to use it.
Caveats:
This method takes strings as input, so if you call it on a function it'll throw an error.
>>> lookUp.online("numpy.linalg.cholesky")
Will work.
>>> lookUp.online(numpy.linalg.cholesky)
Will ask you to give it as a string.
So use autocomplete to get to the function and then wrap it in quotes to get it to work.
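If you would rather call this straight from the IPython prompt, one option (a sketch, assuming the lookUp module above is importable; the magic name and file location are just examples) is to register it as a line magic from an IPython startup file:
# e.g. save as ~/.ipython/profile_default/startup/doc_magic.py
from IPython.core.magic import register_line_magic
import lookUp  # the module defined above

@register_line_magic
def doc_online(line):
    """Open the online docs for a dotted name, e.g. %doc_online numpy.linalg.cholesky"""
    lookUp.online(line.strip())
Then %doc_online numpy.linalg.cholesky opens the page, and because the argument arrives as a plain string, the quoting caveat above goes away.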

tkinter variable for drop-down selection is empty

I tried to program an app in tkinter that loads random lines from a file you select via a pull-down menu and displays the selected line in a text window.
It seems like the variable "var" in insert_txt does not return the selected option but rather an empty string, resulting in the following error:
FileNotFoundError: [Errno 2] No such file or directory: ''
Please help!
#!/usr/bin/env python
# Python 3
import tkinter
from tkinter import ttk
import random

class Application:
    def __init__(self, root):
        self.root = root
        self.root.title('Random Stuff')
        ttk.Frame(self.root, width=450, height=185).pack()
        self.init_widgets()
        var = tkinter.StringVar(root)
        script = var.get()
        choices = ['option1', 'option2', 'option3']
        option = tkinter.OptionMenu(root, var, *choices)
        option.pack(side='right', padx=10, pady=10)

    def init_widgets(self):
        ttk.Button(self.root, command=self.insert_txt, text='Button', width='10').place(x=10, y=10)
        self.txt = tkinter.Text(self.root, width='45', height='5')
        self.txt.place(x=10, y=50)

    def insert_txt(self):
        var = tkinter.StringVar(root)
        name = var.get()
        line = random.choice(open(str(name)).readlines())
        self.txt.insert(tkinter.INSERT, line)

if __name__ == '__main__':
    root = tkinter.Tk()
    Application(root)
    root.mainloop()
That's because you're just creating an empty StringVar that is never modified afterwards, so it returns an empty string.
OptionMenu takes a command parameter that calls the specified method every time another option is selected. You can use a method like this, replacing your insert_txt:
def __init__(self, root):
    # ...
    self.var = tkinter.StringVar()
    self.options = tkinter.OptionMenu(root, self.var, *choices, command=self.option_selected)
    # ...

def option_selected(self, event):
    name = self.var.get()
    # The stuff you already had
Additionally, you have to empty the Text widget first, otherwise the previous text stays there. I think the Entry widget would be a better fit for this, too.
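Putting it together, a minimal sketch of the question's Application class with that fix applied (imports and widget sizes as in the question; only the relevant parts are shown):
class Application:
    def __init__(self, root):
        self.root = root
        self.root.title('Random Stuff')
        ttk.Frame(self.root, width=450, height=185).pack()
        self.init_widgets()
        # keep the variable on self so other methods can read the selection
        self.var = tkinter.StringVar(root)
        choices = ['option1', 'option2', 'option3']
        option = tkinter.OptionMenu(root, self.var, *choices, command=self.option_selected)
        option.pack(side='right', padx=10, pady=10)

    def init_widgets(self):
        self.txt = tkinter.Text(self.root, width='45', height='5')
        self.txt.place(x=10, y=50)

    def option_selected(self, event):
        name = self.var.get()                 # now returns the selected option
        line = random.choice(open(name).readlines())
        self.txt.delete('1.0', tkinter.END)   # clear the previous text first
        self.txt.insert(tkinter.INSERT, line)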

saving and deleting with file I/O

Two-part question.
First, how can I change this (I've tried using 'for' but I can't figure it out) so that it saves like
'key value' instead of '{key: value}'?
with open("phonebook.txt", "w") as x:
    json.dump(a, x)
Second, how do you delete from a file using the user's input?
I cannot see a way of changing this to delete from the file instead of from the dict 'a':
name = input("enter name of contact you want to delete: ")
if name in a:
    del a[name]
EDIT. This is what I've done now, but it doesn't do what's expected (I also tried adding .readlines where the x is, but that just gives errors).
def save(a):
    with open("phonebook.txt", "w") as x:
        for k in a:
            json.dump(str(k) + " " + str(a[k]), x)

def load():
    a = {}
    with open("phonebook.txt", "r") as f:
        for l in f:
            a[l[0]] = l[1]
    print(a)
def save works fine (as far as I can see, anyway).
I have also tried c = l.split() and a[c[0]] = c[1]. It just doesn't want to work!
First part
That's not JSON format. Do not use json if you need something else. Use plain text files, like:
with open("phonebook.txt", "w") as file:
    for key, value in a.items():
        file.write(str(key) + " " + str(value) + "\n")  # newline so each entry gets its own line
Second part
It looks like you loaded the file into dictionary a. In that case, you just need to write dictionary a back to the file after deleting. If you have not loaded the file into the dictionary yet, you can do it with:
a = {}
with open("phonebook.txt") as file:
    for line in file.readlines():
        content = line.split()
        a[content[0]] = content[1]
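To tie the two parts together, a small sketch of the delete flow described above: load the file into a dict, delete the entry the user names, then write the dict back out in 'key value' form (the function name is just illustrative):
def delete_contact():
    # load the file into a dict
    a = {}
    with open("phonebook.txt") as file:
        for line in file.readlines():
            content = line.split()
            a[content[0]] = content[1]

    # delete by user input
    name = input("enter name of contact you want to delete: ")
    if name in a:
        del a[name]

    # write the updated dict back, one 'key value' pair per line
    with open("phonebook.txt", "w") as file:
        for key, value in a.items():
            file.write(str(key) + " " + str(value) + "\n")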