Run a job after the previous job is completely done in APScheduler?

I have a synchronization problem.
from apscheduler.schedulers.background import BackgroundScheduler, BlockingScheduler
from apscheduler.executors.pool import ThreadPoolExecutor, ProcessPoolExecutor
import time

budget = 100000

def plus():
    global budget
    time.sleep(3)
    budget += 20000
    print('plus called, budget : ', budget)

def minus():
    global budget
    budget -= 10000
    print('minus called, budget : ', budget)

executors = {'default': ProcessPoolExecutor(1)}
sched = BackgroundScheduler(executors=executors)
sched.add_job(plus, 'interval', seconds=5, misfire_grace_time=20, max_instances=1)
sched.add_job(minus, 'interval', seconds=6, misfire_grace_time=20, max_instances=1)
sched.start()
How can I make 'minus' start only after 'plus' has completely finished?
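One common way to guarantee this ordering (a sketch, not from the original post) is to schedule a single wrapper job that calls the two functions back to back, so minus can only start once plus has returned. This assumes the plus and minus functions defined above:

from apscheduler.schedulers.blocking import BlockingScheduler

def plus_then_minus():
    plus()   # runs to completion first
    minus()  # only starts after plus() has returned

sched = BlockingScheduler()  # default thread executor, so the global 'budget' is shared
sched.add_job(plus_then_minus, 'interval', seconds=5, max_instances=1)
sched.start()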


Optimization Python

I am trying to get the optimal solution.

Drug data:
D_name   Vial_size1   Vial_size2   Vial_size3   cost   units_needed
Act      120          400          0            $5     738
dug      80           200          400          $40    262

Data in Excel:
Vials         price   size
Vial size 1   5       120
Vial size 2   5       400
from pulp import *
import pandas as pd

prob = LpProblem("Dose_Vial", LpMinimize)
df = pd.read_excel(r'C:\Users\*****\Desktop\Vial.xls')
print(df)
# Create a list of the Vial_Size
Vial_Size = list(df['Vials'])
# Create a dictionary of units for all Vial_Size
size = dict(zip(Vial_Size, df['size']))
# Create a dictionary of price for all Vial_Size
Price = dict(zip(Vial_Size, df['Price']))
# print dictionaries
print(Vial_Size)
print(size)
print(Price)
vial_vars = LpVariable.dicts("Vials", size, lowBound=0, cat='Integer')
# start building the LP problem by adding the main objective function
prob += lpSum([Price[i] * vial_vars[i] * size[i] for i in size])
# adding constraints
prob += lpSum([size[f] * vial_vars[f] for f in size]) >= 738
# The status of the solution is printed to the screen
prob.solve()
print("Status:", LpStatus[prob.status])
# In case the problem is ill-formulated or there is not sufficient information,
# the solution may be infeasible or unbounded
for v in prob.variables():
    if v.varValue > 0:
        print(v.name, "=", format(round(v.varValue)))
Vials_Vial_Size_1 = 3
Vials_Vial_Size_2 = 1
obj = round(value(prob.objective))
print("The total cost of optimized vials: ${}".format(round(obj)))
The total cost of optimized vials: $3800
How do I set this up for 2 or more drugs and get the best optimal solution?
Here is an approach to solving the first part of the question: finding vial combinations that minimize the waste (I'm not sure what role the price plays?):
from pulp import *
import pandas as pd
import csv

drugs_dict = {"D_name": ['Act', 'dug'],
              "Vial_size1": [120, 80],
              "Vial_size2": [400, 200],
              "Vial_size3": [0, 400],
              "cost": [5, 40],
              "units_needed": [738, 262]}

df = pd.DataFrame(drugs_dict)

drugs = list(df['D_name'])
vial_1_size = dict(zip(drugs, drugs_dict["Vial_size1"]))
vial_2_size = dict(zip(drugs, drugs_dict["Vial_size2"]))
vial_3_size = dict(zip(drugs, drugs_dict["Vial_size3"]))
units_needed = dict(zip(drugs, drugs_dict["units_needed"]))

results = []
for drug in drugs:
    print(f"drug = {drug}")
    # setup minimum waste problem
    prob = LpProblem("Minimum Waste Problem", LpMinimize)
    # create decision variables
    vial_1_var = LpVariable("Vial_1", lowBound=0, cat='Integer')
    vial_2_var = LpVariable("Vial_2", lowBound=0, cat='Integer')
    vial_3_var = LpVariable("Vial_3", lowBound=0, cat='Integer')
    units = lpSum([vial_1_size[drug] * vial_1_var +
                   vial_2_size[drug] * vial_2_var +
                   vial_3_size[drug] * vial_3_var])
    # objective function
    prob += units
    # constraints
    prob += units >= units_needed[drug]
    prob.solve()
    print(f"units = {units.value()}")
    for v in prob.variables():
        if v.varValue > 0:
            print(v.name, "=", v.varValue)
    results.append([drug, units.value(),
                    int(vial_1_var.value() or 0),
                    int(vial_2_var.value() or 0),
                    int(vial_3_var.value() or 0)])

with open('vial_results.csv', 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['drug', 'units', 'vial_1', 'vial_2', 'vial_3'])
    csv_writer.writerows(results)
Running gives:
drug = Act
units = 760.0
Vial_1 = 3.0
Vial_2 = 1.0
drug = dug
units = 280.0
Vial_1 = 1.0
Vial_2 = 1.0
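If the goal is instead to minimise money spent rather than waste, the prices can go into the objective. A rough sketch, not from the original answer, assuming (the post does not say) that the cost column is the price of one vial of that drug regardless of vial size, and reusing drugs, vial_1_size, vial_2_size, vial_3_size and units_needed from the code above:

cost = {'Act': 5, 'dug': 40}  # assumed: price per vial, independent of vial size
sizes = {"Vial_1": vial_1_size, "Vial_2": vial_2_size, "Vial_3": vial_3_size}

prob = LpProblem("Minimum_Cost_Problem", LpMinimize)
vial_vars = {(drug, s): LpVariable(f"{drug}_{s}", lowBound=0, cat='Integer')
             for drug in drugs for s in sizes}

# objective: total cost of all vials bought, across both drugs
prob += lpSum(cost[drug] * var for (drug, _), var in vial_vars.items())

# constraints: each drug must receive at least the units it needs
for drug in drugs:
    prob += lpSum(sizes[s][drug] * vial_vars[(drug, s)] for s in sizes) >= units_needed[drug]

prob.solve()
for v in prob.variables():
    if v.varValue > 0:
        print(v.name, "=", v.varValue)
print("total cost =", value(prob.objective))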

How to reduce scraping time when using Requests-HTML?

I currently use Requests-HTML version 0.10.0 and Selenium 3.141.0. My project is to scrape the ratings of all articles on the website https://openreview.net/group?id=ICLR.cc/2021/Conference. To open each page of the website (it has 53 pages and each page has 50 articles), I use Selenium. Then, to open the articles on each page, I use Requests-HTML. My question is how to reduce the time it takes to open each article and get the rating. I currently use await r_inside.html.arender(sleep = 5, timeout=100), which means the sleep time is 5 seconds and the timeout is 100 seconds. When I try to reduce the sleep time to 0.5 seconds, it causes an error because the page does not have enough time to render. However, if I keep the sleep time at 5 seconds, it takes 6 to 13 hours to scrape all 2600 articles. Also, after waiting 13 hours I can scrape all 2600 articles, but the code uses 88 GB of RAM, which is a problem because I need to send this code to other people who will not have enough RAM to run it. My goal is to reduce both the scraping time and the RAM usage. Below is the code I use.
import csv
import time
from requests_html import HTMLSession, AsyncHTMLSession
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from tqdm import tqdm

link = 'https://openreview.net/group?id=ICLR.cc/2021/Conference'

id_list = []
keyword_list = []
abstract_list = []
title_list = []

driver = webdriver.Chrome('./requests_html/chromedriver.exe')
driver.get('https://openreview.net/group?id=ICLR.cc/2021/Conference')
cond = EC.presence_of_element_located((By.XPATH, '//*[@id="all-submissions"]/nav/ul/li[13]/a'))
WebDriverWait(driver, 10).until(cond)

for page in tqdm(range(1, 54)):
    text = ''
    elems = driver.find_elements_by_xpath('//*[@id="all-submissions"]/ul/li')
    for i, elem in enumerate(elems):
        try:
            # parse title
            title = elem.find_element_by_xpath('./h4/a[1]')
            link = title.get_attribute('href')
            paper_id = link.split('=')[-1]
            title = title.text.strip().replace('\t', ' ').replace('\n', ' ')
            # show details
            elem.find_element_by_xpath('./a').click()
            time.sleep(0.2)
            # parse keywords & abstract
            items = elem.find_elements_by_xpath('.//li')
            keyword = ''.join([x.text for x in items if 'Keywords' in x.text])
            abstract = ''.join([x.text for x in items if 'Abstract' in x.text])
            keyword = keyword.strip().replace('\t', ' ').replace('\n', ' ').replace('Keywords: ', '')
            abstract = abstract.strip().replace('\t', ' ').replace('\n', ' ').replace('Abstract: ', '')
            text += paper_id + '\t' + title + '\t' + link + '\t' + keyword + '\t' + abstract + '\n'
            title_list.append(title)
            id_list.append(paper_id)
            keyword_list.append(keyword)
            abstract_list.append(abstract)
        except Exception as e:
            print(f'page {page}, # {i}:', e)
            continue
    # next page
    try:
        driver.find_element_by_xpath('//*[@id="all-submissions"]/nav/ul/li[13]/a').click()
        time.sleep(2)  # NOTE: increase sleep time if needed
    except:
        print('no next page, exit.')
        break
csv_file = open('./requests_html/bb_website_scrap.csv', 'w', encoding="utf-8")
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['Title', 'Keyword', 'Abstract', 'Link', 'Total Number of Reviews', 'Average Rating', 'Average Confidence'])

n = 0
for item in range(len(id_list)):
    title = title_list[item]
    keyword = keyword_list[item]
    abstract = abstract_list[item]
    id = id_list[item]
    link_pdf = f'https://openreview.net/forum?id={id}'
    print(id)
    asession_inside = AsyncHTMLSession()
    r_inside = await asession_inside.get(link_pdf)  # top-level await: run inside Jupyter/IPython or an async function
    print(type(r_inside))
    await r_inside.html.arender(sleep=5, timeout=100)
    test_rating = r_inside.html.find('div.comment-level-odd div.note_contents span.note_content_value')
    print(len(test_rating))
    check_list = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10'}
    total_rating_confidence = []
    total_rating = []
    total_confidence = []
    for t in range(len(test_rating)):
        if any(test_rating[t].text.split(':')[0] in s for s in check_list):
            total_rating_confidence.append(test_rating[t].text.split(':')[0])
    for r in range(len(total_rating_confidence)):
        if r % 2 == 0:
            total_rating.append(int(total_rating_confidence[r]))
        else:
            total_confidence.append(int(total_rating_confidence[r]))
    average_rating = sum(total_rating) / len(total_rating)
    average_confidence = sum(total_confidence) / len(total_confidence)
    csv_writer.writerow([title, keyword, abstract, link_pdf, len(total_rating), average_rating, average_confidence])
    n = n + 1
    print('Order {}'.format(n))

csv_file.close()
I'm no Python expert (in fact, I'm a rank beginner), but the simple answer is better parallelism and session management.
The useful answer is a bit more complicated.
You're leaving the Chromium session around, which is likely what's hoovering up all your RAM. If you call asession_inside.close(), you may see an improvement in RAM usage.
As far as I can tell, you're doing everything in serial: you fetch each page and extract the article data in serial, and then you query each article in serial as well.
You're using arender to fetch each article asynchronously, but you're awaiting it inside a standard for loop. As far as I understand, that means you're not getting any advantage from async; you're still processing each page one at a time (which explains your long run time).
I'd suggest using asyncio to turn the for loop into a parallel version of itself, as suggested in this article. Make sure you set a task limit so that you don't try to load all the articles at once; that will also help with your RAM usage.
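As a rough illustration of that suggestion (not from the original answer), the per-article loop could be turned into asyncio tasks gated by a semaphore, so that only a few pages render at once. A minimal sketch, assuming id_list has been filled by the Selenium stage above and that a concurrency of 4 is acceptable:

import asyncio
from requests_html import AsyncHTMLSession

MAX_CONCURRENT = 4  # assumed task limit; tune to the available RAM/CPU

async def fetch_ratings(asession, semaphore, paper_id):
    async with semaphore:  # at most MAX_CONCURRENT pages are rendered at any one time
        r = await asession.get(f'https://openreview.net/forum?id={paper_id}')
        await r.html.arender(sleep=5, timeout=100)
        return paper_id, r.html.find('div.comment-level-odd div.note_contents span.note_content_value')

async def scrape_all(ids):
    asession = AsyncHTMLSession()
    semaphore = asyncio.Semaphore(MAX_CONCURRENT)
    try:
        return await asyncio.gather(*(fetch_ratings(asession, semaphore, i) for i in ids))
    finally:
        await asession.close()  # release the Chromium session and its memory

# results = asyncio.run(scrape_all(id_list))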

Python multiprocessing: how to update a complex object in a manager list without using the .join() method

I started programming in Python about 2 months ago and I've been struggling with this problem for the last 2 weeks.
I know there are many threads similar to this one, but I can't find a solution that suits my case.
I need a main process, which is the one that interacts with Telegram, and another process, buffer, which receives the complex object from the main process and updates it.
I'd like to do this in a simpler and smoother way.
At the moment the objects are not being updated, because I am using multiprocessing without the join() method.
I then tried to use multithreading instead, but it gives me compatibility problems with Pyrogram, the framework I am using to interact with Telegram.
I have reproduced the "complexity" of my project below, in order to reproduce the same error I am getting and to give and get the best help possible.
a.py
class A():
    def __init__(self, length=-1, height=-1):
        self.length = length
        self.height = height
b.py
from a import A

class B(A):
    def __init__(self, length=-1, height=-1, width=-1):
        super().__init__(length=-1, height=-1)
        self.length = length
        self.height = height
        self.width = width

    def setHeight(self, value):
        self.height = value
c.py
class C():
    def __init__(self, a, x=0, y=0):
        self.a = a
        self.x = x
        self.y = y

    def func1(self):
        if self.x < 7:
            self.x = 7
d.py
from c import C

class D(C):
    def __init__(self, a, x=0, y=0, z=0):
        super().__init__(a, x=0, y=0)
        self.a = a
        self.x = x
        self.y = y
        self.z = z

    def func2(self):
        self.func1()
main.py
from b import B
from d import D
from multiprocessing import Process, Manager
from buffer import buffer

if __name__ == "__main__":
    manager = Manager()
    lizt = manager.list()
    buffer = Process(target=buffer, args=(lizt,))  # passing the list as a parameter
    buffer.start()
    # can't invoke buffer.join() here because I need the code below to keep running
    # while the buffer process takes a few minutes to process an instance passed in the list;
    # hence I can't wait for join() to update the objects inside the buffer,
    # but I need the objects updated in order to pop them out of the list

    import datetime as dt
    t = dt.datetime.now()

    # library of a kind of multithreading (pool of 4 processes), uses the asyncio lib
    # this while loop was put here to reproduce the same error I am getting
    while True:
        if t + dt.timedelta(seconds=10) < dt.datetime.now():
            lizt.append(D(B(5, 5, 5)))
            t = dt.datetime.now()

"""
# This is the code which looks like the one in my project
# main.py
from pyrogram import Client  # library of a kind of multithreading (pool of 4 processes), uses the asyncio lib
from b import B
from d import D
from multiprocessing import Process, Manager
from buffer import buffer

if __name__ == "__main__":
    api_id = 1234567
    api_hash = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
    app = Client("my_account", api_id, api_hash)
    manager = Manager()
    lizt = manager.list()
    buffer = Process(target=buffer, args=(lizt,))  # passing the list as a parameter
    buffer.start()
    # can't invoke buffer.join() here because I need the code below to run at the same time
    # as the buffer process; hence I can't wait for join() to update the objects inside the buffer

    @app.on_message()
    def my_handler(client, message):
        lizt.append(complex_object_containing_message)
"""
buffer.py
def buffer(buffer):
    print("buffer was defined")
    while True:
        if len(buffer) > 0:
            print(buffer[0].x)          # prints 0
            buffer[0].func2()           # this changes the attribute locally in the class instance, but not in here
            print(buffer[0].x)          # prints 0, but I'd like it to be 7
            print(buffer[0].a.height)   # prints 5
            buffer[0].a.setHeight(10)   # and this has the same behaviour
            print(buffer[0].a.height)   # prints 5, but I'd like it to be 10
            buffer.pop(0)
This is the whole code about the problem I am having.
Literally every suggestion is welcome, hopefully constructive, thank you in advance!
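For context (this is not part of the original thread): indexing a Manager().list() returns a copy of the stored object, so mutating that copy never reaches the shared list. The usual workaround is to mutate the copy and write it back through the proxy; a minimal sketch of buffer.py rewritten that way:

def buffer(buffer):
    while True:
        if len(buffer) > 0:
            obj = buffer[0]            # this is a local copy of the stored object
            obj.func2()
            obj.a.setHeight(10)
            buffer[0] = obj            # writing the copy back pushes the update to the manager
            print(buffer[0].x)         # now prints 7
            print(buffer[0].a.height)  # now prints 10
            buffer.pop(0)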
In the end I had to change my approach and solve this with asyncio, which is what the framework itself uses.
This solution offers everything I was looking for:
- complex objects get updated
- the problems of multiprocessing (in particular with join()) are avoided
It is also lightweight: before, I had 2 Python processes of about 40K and 75K; the current single process is about 30K (and it is also faster and cleaner).
Here is the solution; I hope it will be useful for someone else like it was for me.
The class files are skipped because this solution updates complex objects absolutely fine.
main.py
from pyrogram import Client
import asyncio
import time
from firstWorker import firstWorker

def cancel_tasks():
    # get all tasks in the current loop
    tasks = asyncio.Task.all_tasks()
    for t in tasks:
        t.cancel()

try:
    buffer = []
    firstWorker(buffer)  # this one is the old buffer.py file and function
    # the missing loop and loop method are explained in the next piece of code
except KeyboardInterrupt:
    print("")
finally:
    print("Closing Loop")
    cancel_tasks()
firstWorker.py
import asyncio
from pyrogram import Client
from secondWorker import secondWorker

def firstWorker(buffer):
    print("First Worker Executed")
    api_id = 1234567
    api_hash = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
    app = Client("my_account", api_id, api_hash)

    @app.on_message()
    async def my_handler(client, message):
        print("Message Arrived")
        buffer.append(complex_object_containing_message)
        await asyncio.sleep(1)

    # Here is the trick: I changed the run() method of the Client class inside the
    # Pyrogram framework, since it was a loop itself. In this way I added another
    # task to the existing loop in order to let both of them run together.
    app.run(secondWorker(buffer))
secondWorker.py
import asyncio

async def secondWorker(buffer):
    while True:
        if len(buffer) > 0:
            print(buffer.pop(0))
        await asyncio.sleep(1)
The resources to understand the asyncio used in this code can be found here:
Asyncio simple tutorial
Python Asyncio Official Documentation
This tutorial about how to fix classical Asyncio errors
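As a generic illustration of why this single-loop approach sidesteps the update problem (this sketch is not from the original answer and does not use Pyrogram): two coroutines that share one plain list on one event loop see each other's changes immediately, with no proxies involved. The coroutine names here are illustrative only:

import asyncio

async def producer(buffer):
    # stands in for the Telegram handler side: it appends items to the shared list
    for i in range(3):
        buffer.append(f"message {i}")
        await asyncio.sleep(1)

async def consumer(buffer):
    # stands in for secondWorker: it drains the shared list
    while True:
        if buffer:
            print(buffer.pop(0))
        await asyncio.sleep(1)

async def main():
    buffer = []
    # both coroutines run on the same event loop and share one ordinary list,
    # so no multiprocessing proxies are needed and updates are seen immediately
    await asyncio.gather(producer(buffer), consumer(buffer))

# asyncio.run(main())  # runs forever, because consumer() never returns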

ibpy: how to get the commission using Interactive Brokers' API?

I have the following code that uses the Python API of IB; it should display both the price and the commission:
from ib.ext.ExecutionFilter import ExecutionFilter
from ib.ext.CommissionReport import CommissionReport
from ib.opt import ibConnection, message
from time import sleep

#-- message handlers --------------------------------------------------
# print all messages from TWS
def watcher(msg):
    pass

def ExecutionDetailsHandler(msg):
    global execDetails
    execDetails = msg.execution
    #print execDetails.m_price
    #print execDetails.m_side

def CommissionDetailsHandler(msg):
    global commission
    commission = msg.commissionReport

# global variables that store the last Execution / CommissionReport seen by
# ExecutionDetailsHandler and CommissionDetailsHandler
execDetails = None
commission = None

#-- factories ----------------------------------------------------------
def makeExecFilter():
    filter = ExecutionFilter()
    return filter

#-- utilities ----------------------------------------------------------
def getExecutionPrice():
    filter = makeExecFilter()
    con.reqExecutions(744, filter)
    # wait for the TWS message to come back to the message handler
    while execDetails is None:
        print 'waiting'
        sleep(1)
    return execDetails.m_price

def getCommission():
    filter = CommissionReport()
    con.commissionReport(filter)
    # wait for the TWS message to come back to the message handler
    while commission is None:
        print 'waiting'
        sleep(1)
    return commission.m_commission

con = ibConnection()
con.registerAll(watcher)
con.register(ExecutionDetailsHandler, 'ExecDetails')
con.register(CommissionDetailsHandler, 'commissionDetails')
con.connect()
price = getExecutionPrice()
c = getCommission()
con.disconnect()
print 'The price of one execution is:', price
print 'The commission fee is:', c
However, this only works for the execution price: the price information is printed, but the commission information never shows up (in my terminal it keeps waiting forever). Is there anything wrong with my code?
Thanks to Brian's answer, this does the trick:
from ib.opt import Connection, message

commission = None

def commReport(msg):
    global commission
    #print('ID', msg.commissionReport.m_execId, 'COM', msg.commissionReport.m_commission)
    commission = msg.commissionReport.m_commission

conn = Connection.create(port=7496, clientId=222)
conn.register(commReport, message.commissionReport)
conn.connect()
and now I can use the commission anywhere I want.
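Putting the two pieces together, one way to drive it might look like the sketch below (not from the original thread and not verified against a live TWS session; it assumes TWS is listening on port 7496 and keeps the Python 2 style of the question):

from time import sleep
from ib.ext.ExecutionFilter import ExecutionFilter
from ib.opt import Connection, message

execDetails = None
commission = None

def execHandler(msg):
    global execDetails
    execDetails = msg.execution

def commHandler(msg):
    global commission
    commission = msg.commissionReport.m_commission

conn = Connection.create(port=7496, clientId=222)
conn.register(execHandler, 'ExecDetails')
conn.register(commHandler, message.commissionReport)
conn.connect()
# requesting executions makes TWS push both ExecDetails and commissionReport messages
conn.reqExecutions(744, ExecutionFilter())

while execDetails is None or commission is None:
    print 'waiting'
    sleep(1)

print 'The price of one execution is:', execDetails.m_price
print 'The commission fee is:', commission
conn.disconnect()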

APScheduler - job not executed

I'm new to APScheduler and testing it before implementing it in a larger scope.
I have created the code below, but somehow my functions are never called when I use add_job with trigger='date'. If I use trigger='interval', then everything works fine.
I also tried to play with run_date param, with no luck.
Any idea what could be wrong?
I am using APScheduler version 3.0.3.
Many thanks in advance :)
import sys
import uuid
from datetime import datetime, timedelta
from time import sleep
from pytz import utc
from apscheduler import events
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.jobstores.sqlalchemy import SQLAlchemyJobStore
from apscheduler.executors.pool import ThreadPoolExecutor, ProcessPoolExecutor

def my_listener(event):
    if event.exception:
        print('The job crashed :(')
    else:
        print('The job worked :)')

def test():
    print("{} ok ".format(datetime.now()))

def myFunc(content, img):
    print("{} - content={}|image{}".format(datetime.now(), content, img))
    myfile = open("scheduler.log", "a")
    myfile.write("{} - content={}|image{}".format(datetime.now(), content, img))
    myfile.close()

def main():
    jobstores = {
        'default': SQLAlchemyJobStore(url="postgresql+psycopg2://{}:{}@{}:{}/{}".format(db_user, db_password, db_host, db_port, db_database))
    }
    executors = {
        'default': ThreadPoolExecutor(20),
        'processpool': ProcessPoolExecutor(5)
    }
    job_defaults = {
        'coalesce': False,
        'max_instances': 3
    }
    scheduler = BackgroundScheduler(jobstores=jobstores, executors=executors, job_defaults=job_defaults, timezone=utc)
    scheduler.start()

    CURR_DATE = datetime.strptime(datetime.strftime(datetime.now(), '%Y%m%d%H%M'), '%Y%m%d%H%M')
    JOB_DATE = CURR_DATE + timedelta(minutes=1)
    uid = uuid.uuid4()
    newjob = scheduler.add_job(myFunc,
                               trigger='date',
                               args=['content data', 'image data'],
                               kwargs=None,
                               id=str(uid),
                               name='test' + str(uid),
                               misfire_grace_time=5,
                               coalesce=False,
                               max_instances=1,
                               next_run_time=JOB_DATE,
                               jobstore='default',
                               executor='default',
                               replace_existing=True)
    print("Added - {}".format(newjob))
    scheduler.add_listener(my_listener, events.EVENT_JOB_EXECUTED | events.EVENT_JOB_ERROR)
    scheduler.print_jobs()
    while True:
        sys.stdout.write('{}\n'.format(datetime.now()))
        sys.stdout.flush()
        sleep(1)

if __name__ == "__main__":
    main()
The problem is that you are using
next_run_time=JOB_DATE,
instead of run_date=JOB_DATE.
Another option is declaring your trigger first and passing it as a parameter to add_job, like this:
from apscheduler.triggers.date import DateTrigger

trigger = DateTrigger(run_date=start_date)  # start_date is the intended run time (JOB_DATE above)
newjob = scheduler.add_job(myFunc,
                           trigger=trigger,
                           args=['content data', 'image data'],
                           kwargs=None,
                           id=str(uid),
                           name='test' + str(uid),
                           misfire_grace_time=5,
                           coalesce=False,
                           max_instances=1,
                           jobstore='default',
                           executor='default',
                           replace_existing=True)
Another problem with your code is that while you made your scheduler timezone-aware, you are using a naive datetime:
CURR_DATE = datetime.strptime(datetime.strftime(datetime.now(), '%Y%m%d%H%M'), '%Y%m%d%H%M')
JOB_DATE = CURR_DATE + timedelta(minutes=1)  # this is naive
Try declaring your date this way:
import pytz
import datetime
job_date = datetime.datetime.now(pytz.UTC) + datetime.timedelta(minutes=1)
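Putting both fixes together, a minimal sketch (not from the original answer) of what the corrected call could look like, reusing myFunc from the question:

import uuid
import pytz
import datetime
from apscheduler.schedulers.background import BackgroundScheduler

scheduler = BackgroundScheduler(timezone=pytz.utc)
scheduler.start()

# timezone-aware run time, one minute from now
job_date = datetime.datetime.now(pytz.UTC) + datetime.timedelta(minutes=1)

scheduler.add_job(myFunc,
                  trigger='date',
                  run_date=job_date,   # run_date, not next_run_time
                  args=['content data', 'image data'],
                  id=str(uuid.uuid4()),
                  misfire_grace_time=5)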