I have a routine task to upload and share a database dump to an S3 bucket. While the code below works, for some reason it will not overwrite the existing file.
From the docs, I need to:
1) define a solution for two parallel executions:
path = luigi.Parameter(default=glob(DATA_DIR)[-2], batch_method=max)
2) add resources = {'overwrite_resource': 1}
While this works for local files, it does not for S3.
class report_to_S3(luigi.Task):
    client = S3Client()
    path = luigi.Parameter(default=glob(DATA_DIR)[-2], batch_method=max)
    local_dump_path = '../../../data/local_db_dump.csv'
    resources = {'overwrite_resource': 1}

    def requires(self):
        return upload_tweets(path=self.path)

    def output(self):
        self.s3_path = "s3://qclm-nyc-ct/dump/dump.csv"
        return S3Target(self.s3_path, client=self.client)

    def run(self):
        c = sqa.create_engine('postgresql:///qc_soc_media')
        df = pd.read_sql_query('SELECT id, user_id, timestamp, lat, lon, ct FROM tweets WHERE ct IS NOT NULL', c)
        N = len(df)
        df.to_csv(self.local_dump_path, index=None)
        self.client.put(self.local_dump_path, self.output().path,
                        headers={'Content-Type': 'application/csv'})
        send_S3_report(N)

if __name__ == '__main__':
    luigi.run(local_scheduler=True, main_task_cls=report_to_S3)
If the target specified in the output() method already exists, the run() method will not execute. You may want to work the timestamp into the filename, or create another sentinel/flag that indicates the work is done.
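For example, a minimal sketch of the timestamp approach (the key layout and date format here are my own illustration, not from the original task):

import datetime

def output(self):
    # Embed the run date in the S3 key, so each execution writes a fresh
    # target and Luigi no longer considers the task already complete.
    date_str = datetime.date.today().isoformat()
    self.s3_path = "s3://qclm-nyc-ct/dump/dump_{}.csv".format(date_str)
    return S3Target(self.s3_path, client=self.client)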
Suppose I create 10 multiply-imputed datasets and use the (wonderful) MatchThem package in R to create weights for my exposure variable. The MatchThem package takes a mids object and converts it to an object of class wimids.
My desired output is a mids object - but with weights. I hope to pass this mids object to BRMS as follows:
library(brms)
m0 <- brm_multiple(Y|weights(weights) ~ A, data = mids_data)
Open to suggestions.
EDIT: Noah's solution below will unfortunately not work.
The package's first author, Farhad Pishgar, sent me the following elegant solution. It will create a mids object from a wimids object. Thank you Farhad!
library(mice)
library(MatchThem)
#"weighted.datasets" is our .wimids object
#Extracting the original dataset with missing values
maindataset <- complete(weighted.datasets, action = 0)
#Some spit-and-polish
maindataset <- data.frame(.imp = 0, .id = seq_len(nrow(maindataset)), maindataset)
#Extracting imputed-weighted datasets in the long format
alldataset <- complete(weighted.datasets, action = "long")
#Binding them together
alldataset <- rbind(maindataset, alldataset)
#Converting to .mids
newmids <- as.mids(alldataset)
Additionally, for BRMS, I worked out this solution which instead creates a list of dataframes. It will work in fewer steps.
library("mice")
library("dplyr")
library("MatchThem")
library("brms") # for bayesian estimation.
# Note, I realise that my approach here is not fully Bayesian, but that is a good thing! I need to ensure balance in the exposure.
# impute missing data
data("nhanes2")
imp <- mice(nhanes2, printFlag = FALSE, seed = 0, m = 10)
# MatchThem. This is just a fast method
w_imp <- weightthem(hyp ~ chl + age, data = imp,
                    approach = "within",
                    estimand = "ATE",
                    method = "ps")
# get individual data frames with weights
out <- complete(w_imp, action = "long", include = FALSE, mild = TRUE)
# assemble individual data frames into a list
m <- 10
listdat <- list()
for (i in 1:m) {
  listdat[[i]] <- as.data.frame(out[[i]])
}
# pass the list to brms, and it runs as it should!
fit_1 <- brm_multiple(bmi|weights(weights) ~ age + hyp + chl,
                      data = listdat,
                      backend = "cmdstanr",
                      family = "gaussian",
                      prior = set_prior('normal(0, 1)',
                                        class = 'b'))
brm_multiple() can take in a list of data frames for its data argument. You can produce this from the wimids object using complete(). The output of complete() with action = "all" is a mild object, which is a list of data frames, but this is not recognized by brm_multiple() as such. So, you can just convert it to a list. This should look like the following:
df_list <- complete(mids_data, "all")
class(df_list) <- "list"
m0 <- brm_multiple(Y|weights(weights) ~ A, data = df_list)
Using complete() automatically adds a weights column to the resulting imputed data frames.
The pictures are not displaying. The code executes just fine. I took out the api key.
import datetime
import requests
from IPython.display import Image

def gimmePictures(num):
    for n in range(0, num):
        now = datetime.datetime.now()
        day4Pictures = now - datetime.timedelta(days=n)
        data = {'api_key': '',
                'date': day4Pictures.date()}
        print(data)
        # using the params argument in our request
        result = requests.get('https://api.nasa.gov/planetary/apod', params=data)
        # create a dictionary for yesterday's picture
        dict_day = result.json()
        print(dict_day['date'])
        Image(dict_day['url'])

gimmePictures(10)
Inside a function, Image(dict_day['url']) is just a value that gets discarded; Jupyter only auto-renders the result of the last expression in a cell, so nothing is shown. Wrap each image in IPython's display(), as covered in How can I display an image from a file in Jupyter Notebook?:
import datetime
import requests
from IPython.display import Image, display

def gimmePictures(num):
    listofImageNames = []
    for n in range(0, num):
        now = datetime.datetime.now()
        day4Pictures = now - datetime.timedelta(days=n)
        data = {'api_key': 'dcS6cZ9DJ4zt9oXwjF6hgemj38bNJo0IGcvFGZZj', 'date': day4Pictures.date()}
        # using the params argument in our request
        result = requests.get('https://api.nasa.gov/planetary/apod', params=data)
        # create a dictionary for yesterday's picture
        dict_day = result.json()
        listofImageNames.append(dict_day['url'])
    for imageName in listofImageNames:
        display(Image(imageName))

gimmePictures(10)
I am trying to do the simplest thing with Ray, but no matter what I do it just never releases memory and fails.
The use case is simply:
read parquet files to DF -> pass to pool of actors -> make changes to DF -> return DF
import ray
# assuming fastparquet here, whose ParquetFile/write API matches the calls below
from fastparquet import ParquetFile, write

@ray.remote
class Main_func:
    def calculate(self, data):
        # do some things with the DF
        return df.copy(deep=True)  # one of many attempts to fix the problem, but didn't work

cpus = 24
actors = []
for _ in range(cpus):
    actors.append(Main_func.remote())

from ray.util import ActorPool
pool = ActorPool(actors)

import os
arr = os.listdir("/some/files")

def to_ray():
    try:
        filename = arr.pop(0)
        pf = ParquetFile("/some/files/" + filename)
        df = pf.to_pandas()
        pool.submit(lambda a, v: a.calculate.remote(v), df.copy(deep=True))
    except Exception as e:
        print(e)

for _ in range(cpus):
    to_ray()

while True:
    res = pool.get_next_unordered()
    write('./temp/' + random_filename, res, compression='GZIP')
    del res
    to_ray()
I have tried other ways of doing the same thing, manually submitting rather than using the map command, but whatever I do it always locks up memory and fails after a few hundred DataFrames.
Does each task need to preserve state across different files? If not, Ray's tasks abstraction should simplify things:
import os

import pandas as pd
import ray

ray.init()

@ray.remote
def read_and_write(path):
    df = pd.read_parquet(path)
    # ... do things
    df.to_parquet("./temp/...")

arr = os.listdir("/some/files")
results = ray.get([read_and_write.remote(path) for path in arr])
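If memory is still an issue with many large files, one common pattern (my own suggestion, not from the original answer) is to cap the number of in-flight tasks with ray.wait(), so results are consumed as they finish instead of accumulating:

in_flight = []
for path in arr:
    # Once the cap is reached, block until one task finishes and consume
    # its result before submitting more work.
    if len(in_flight) >= 24:  # cap chosen to match the 24-CPU pool above
        done, in_flight = ray.wait(in_flight, num_returns=1)
        ray.get(done)
    in_flight.append(read_and_write.remote(path))
ray.get(in_flight)  # drain the remaining tasks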
I'm trying to loop through all the files in a directory and add "indicator" data to them. I had the code working where I could select one file and do this, but now I am trying to make it work on all files. The problem is that when I run the loop it says
ValueError: Invalid file path or buffer object type: <class 'list'>
The goal is for each iteration to read another file from the list, make changes, and save the file back to the folder with the changes.
Here is the complete code without imports. I copied one of the file paths from the list and put it in a comment at the bottom.
### open dialog to select file
#file_path = filedialog.askopenfilename()

###create list from dir
listdrs = os.listdir('c:/Users/17409/AppData/Local/Programs/Python/Python38/Indicators/Sentdex Tutorial/stock_dfs/')

###append full path to list
string = 'c:/Users/17409/AppData/Local/Programs/Python/Python38/Indicators/Sentdex Tutorial/stock_dfs/'
listdrs_path = [string + x for x in listdrs]
print(listdrs_path)

###start loop, for each "file" in listdrs run the 2 functions below and overwrite saved csv.
for file in listdrs_path:
    file_path = listdrs_path
    data = pd.read_csv(file_path, index_col=0)

    ########################################
    ####function 1
    def get_price_hist(ticker):
        # Put stock price data in dataframe
        data = pd.read_csv(file_path)
        #listdr = os.listdir('Users\17409\AppData\Local\Programs\Python\Python38\Indicators\Sentdex Tutorial\stock_dfs')
        print(listdr)
        # Convert date to timestamp and make index
        data.index = data["Date"].apply(lambda x: pd.Timestamp(x))
        data.drop("Date", axis=1, inplace=True)
        return data

    df = data
    ##print(data)

    ######Indicator data#####################
    def get_indicators(data):
        # Get MACD
        data["macd"], data["macd_signal"], data["macd_hist"] = talib.MACD(data['Close'])
        # Get MA10 and MA30
        data["ma10"] = talib.MA(data["Close"], timeperiod=10)
        data["ma30"] = talib.MA(data["Close"], timeperiod=30)
        # Get RSI
        data["rsi"] = talib.RSI(data["Close"])
        return data
    #####end functions#######

    data2 = get_indicators(data)
    print(data2)
    data2.to_csv(file_path)

###################################################
#here is an example of what path from list looks like
#'c:/Users/17409/AppData/Local/Programs/Python/Python38/Indicators/Sentdex Tutorial/stock_dfs/A.csv'
The problem is in your loop header and the line right after it. Your filename is in the variable file, but you are using file_path, to which you assigned the whole file list. Because of this you are getting the ValueError. Try this:
### open dialog to select file
#file_path = filedialog.askopenfilename()

###create list from dir
listdrs = os.listdir('c:/Users/17409/AppData/Local/Programs/Python/Python38/Indicators/Sentdex Tutorial/stock_dfs/')

###append full path to list
string = 'c:/Users/17409/AppData/Local/Programs/Python/Python38/Indicators/Sentdex Tutorial/stock_dfs/'
listdrs_path = [string + x for x in listdrs]
print(listdrs_path)

###start loop, for each "file" in listdrs run the 2 functions below and overwrite saved csv.
for file_path in listdrs_path:
    data = pd.read_csv(file_path, index_col=0)

    ########################################
    ####function 1
    def get_price_hist(ticker):
        # Put stock price data in dataframe
        data = pd.read_csv(file_path)
        #listdr = os.listdir('Users\17409\AppData\Local\Programs\Python\Python38\Indicators\Sentdex Tutorial\stock_dfs')
        print(listdr)
        # Convert date to timestamp and make index
        data.index = data["Date"].apply(lambda x: pd.Timestamp(x))
        data.drop("Date", axis=1, inplace=True)
        return data

    df = data
    ##print(data)

    ######Indicator data#####################
    def get_indicators(data):
        # Get MACD
        data["macd"], data["macd_signal"], data["macd_hist"] = talib.MACD(data['Close'])
        # Get MA10 and MA30
        data["ma10"] = talib.MA(data["Close"], timeperiod=10)
        data["ma30"] = talib.MA(data["Close"], timeperiod=30)
        # Get RSI
        data["rsi"] = talib.RSI(data["Close"])
        return data
    #####end functions#######

    data2 = get_indicators(data)
    print(data2)
    data2.to_csv(file_path)
Let me know if it helps.
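As a side note (my own suggestion, not part of the answer above), pathlib makes the path handling less error-prone; a minimal sketch, assuming the same get_indicators() function defined above:

from pathlib import Path
import pandas as pd

stock_dir = Path('c:/Users/17409/AppData/Local/Programs/Python/Python38/Indicators/Sentdex Tutorial/stock_dfs')

for csv_path in stock_dir.glob('*.csv'):
    data = pd.read_csv(csv_path, index_col=0)
    data2 = get_indicators(data)  # same function as defined above
    data2.to_csv(csv_path)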
I have a DataFlow job which first reads in two text files located in Google Cloud Storage. The text files contain the paths to images, also located in Google Cloud Storage.
After some inspection, I can confirm that reading the text files is successful, but the DataFlow job gets stuck reading the images. The same code runs perfectly locally, which made me think the image paths might be incorrect, but they are not.
Here's my job ID: 2018-01-10_12_16_56-8294573519126715750
Any advice would be appreciated. Also, any pointers on how to solve or debug this problem would be really useful, as I don't even know where to start.
Thanks
Pipeline Definition
def configure_pipeline(pipeline, args):
    read_input_source = beam.io.ReadFromText(args.input_path, strip_trailing_newlines=True)
    read_img_paths = beam.io.ReadFromText(args.input_imgs, strip_trailing_newlines=True)
    img_paths = (pipeline | 'Read image paths' >> read_img_paths)
    train_points = (pipeline | 'Read data point' >> read_input_source)
    _ = (train_points | "Read image" >> beam.ParDo(ExtractDataDoFn(), beam.pvalue.AsIter(img_paths)))
Read Images - Most of the code simply parses the image paths from the text file, along with some data used to index the images.
class ExtractDataDoFn(beam.DoFn):
    def start_bundle(self, context=None):
        # Each frame has its own path to its image
        self.frame_number_to_name = {}

    def process(self, element, img_paths):
        try:
            line = element.element
        except AttributeError:
            pass
        if not self.frame_number_to_name:
            for path in img_paths:
                if len(path) > 4:
                    frame_number = int(path[-10 : -4])
                    self.frame_number_to_name[frame_number] = path
        line_tokens = element.split(':')
        pivot_example = line_tokens[0].strip('\'')
        example = line_tokens[1].strip('\'')
        label = int(line_tokens[2])

        # Get image paths
        pivot_frame_number = int(pivot_example.split(',')[0])
        pivot_path = self.frame_number_to_name[pivot_frame_number]
        example_frame_number = int(example.split(',')[0])
        example_path = self.frame_number_to_name[example_frame_number]

        # Read images
        def _open_file_read_binary(uri):
            try:
                return file_io.FileIO(uri, mode='rb')
            except errors.InvalidArgumentError:
                return file_io.FileIO(uri, mode='r')

        # Read pivot
        try:
            with _open_file_read_binary(pivot_path) as f:
                pivot_image_bytes = f.read()
                pivot_img = Image.open(io.BytesIO(pivot_image_bytes)).convert('RGB')
        except Exception as e:  # pylint: disable=broad-except
            logging.exception('Error processing image %s: %s', pivot_example, str(e))
            return

        # Read example
        try:
            with _open_file_read_binary(example_path) as f:
                example_image_bytes = f.read()
                example_img = Image.open(io.BytesIO(example_image_bytes)).convert('RGB')
        except Exception as e:  # pylint: disable=broad-except
            logging.exception('Error processing image %s: %s', example, str(e))
            return

        # Convert to Numpy array
        pivot_np = np.array(pivot_img)
        example_np = np.array(example_img)

        def _get_feature(line, img):
            frame_number = int(line.split(',')[0])
            y, x = int(line.split(',')[3]), int(line.split(',')[2])
            h, w = int(line.split(',')[5]), int(line.split(',')[4])
            bb = img[y : y + h, x : x + w, :]
            return bb

        # Get raw content of bounding box
        pivot_feature = _get_feature(pivot_example, pivot_np)
        example_feature = _get_feature(example, example_np)

        # Resize data
        pivot_feature = Image.fromarray(pivot_feature).resize((224, 224))
        example_feature = Image.fromarray(example_feature).resize((224, 224))

        # Convert back to numpy
        pivot_feature = np.array(pivot_feature, np.float64)
        example_feature = np.array(example_feature, np.float64)

        # print(pivot_feature.shape)
        yield pivot_feature, example_feature, label