I am trying to refactor a bulk upload job to improve its performance: large account uploads currently take a long time and put heavy load on the database. Can someone please help me change ConfirmPendingAccountsJob so that it can accept large numbers of accounts (at least 50,000) and process them efficiently?
def confirm_pending_account(account_upload)
  ConfirmPendingAccountsJob.perform_later(account_upload: account_upload)
end
class ConfirmPendingAccountsJob < BaseJob
  queue_as :low_priority

  attr_reader :accounts, :account_upload

  def perform(account_upload:)
    @account_upload = account_upload
    pending_accounts = account_upload.pending_accounts.not_failed

    @accounts = {
      previously_existing: [],
      newly_created: []
    }

    pending_accounts.each do |pending_account|
      # TODO: Find duplicates in bulk
      duplicate_account_in_same_organisation = pending_account.find_duplicate
      if duplicate_account_in_same_organisation
        accounts[:previously_existing] << [pending_account, duplicate_account_in_same_organisation]
        next
      end

      # Verified is set before this job is run; the next line is only needed in order to run this
      # job in isolation, from the command line, otherwise all account creation will be skipped
      # pending_account.account_verified!
      next unless pending_account.verified? # Failure already recorded by VerifyPendingAccountJob

      created_account = pending_account.create_account_with_errors
      if created_account.errors.any?
        pending_account.record_failure!(serialized_errors: created_account.errors.to_hash)
      else
        accounts[:newly_created] << [pending_account, created_account]
      end
    end

    finalise_accounts_creation
  end

  private

  def finalise_accounts_creation
    move_into_teams_and_licences

    accounts[:previously_existing].each do |pending_account, account|
      update_pending_account(pending_account, account, status: :updated)
    end

    accounts[:newly_created].each do |pending_account, account|
      update_pending_account(pending_account, account, status: :created)
      account.confirm
    end
  end

  def move_into_teams_and_licences
    Team.find(account_upload.team_ids).each do |team|
      team.add_accounts(all_accounts)
    end

    Licensing::Public::API.find_licence(account_upload.licence_ids).each do |licence|
      licence.bulk_enrol_accounts(all_accounts)
    end

    EventPublisher.account_licences_changed(all_accounts)
  end

  def update_pending_account(pending_account, account, status:)
    if status == :created
      pending_account.create_account!
    else
      pending_account.update_account!
    end

    # This establishes the link from an account to a pending account
    pending_account.update!(account: account)
  end

  def all_accounts
    newly_created_accounts + previously_existing_accounts
  end

  def previously_existing_accounts
    accounts[:previously_existing].map { |_pending_account, account| account }
  end

  def newly_created_accounts
    accounts[:newly_created].map { |_pending_account, account| account }
  end
end
I want to send batches of accounts to the downstream methods (most importantly team.add_accounts, but also licence.bulk_enrol_accounts) to enable more efficient processing.
I want to handle batching in the same way throughout, so that the job can accept an arbitrary number of accounts while individual jobs process a manageable number.
If necessary to protect the DB, the job should also accept parameters for batch size and delay between batches to enable throttling.
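Something like the following is the shape I have in mind. This is only a rough sketch: ConfirmPendingAccountsBatchJob is a hypothetical per-batch job, the batch_size and delay parameter names are mine, and it assumes pending_accounts is an ordinary ActiveRecord relation.

class ConfirmPendingAccountsJob < BaseJob
  queue_as :low_priority

  # Fan the upload out into smaller jobs instead of processing everything inline.
  # batch_size and delay are the hypothetical throttling knobs mentioned above.
  def perform(account_upload:, batch_size: 500, delay: 30.seconds)
    account_upload.pending_accounts.not_failed.in_batches(of: batch_size).each_with_index do |batch, index|
      ConfirmPendingAccountsBatchJob
        .set(wait: index * delay) # stagger batches to protect the DB
        .perform_later(account_upload: account_upload, pending_account_ids: batch.ids)
    end
  end
end

Each ConfirmPendingAccountsBatchJob would then run the existing per-account logic over its ids, and call team.add_accounts and licence.bulk_enrol_accounts once per batch rather than once per upload.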
There is a program consisting of three modules. The Print module receives a number from the keyboard, passes it to another module, receives the response, and displays it on the screen. The Proc1 and Proc2 modules receive a number, perform calculations, and send the result back.
defmodule Launch do
  @moduledoc """
  Documentation for `Launch`.
  """

  @doc """
  """
  def start() do
    children = [
      %{
        id: Print,
        start: {Print, :print, []}
      },
      %{
        id: Proc1,
        start: {Proc1, :proc1, []}
      },
      %{
        id: Proc2,
        start: {Proc2, :proc2, []}
      }
    ]

    Supervisor.start_link(children, strategy: :one_for_one)
  end
end
defmodule Print do
  def print() do
    num =
      IO.gets("Input number: ")
      |> String.trim()
      |> String.to_integer()

    if num >= 0 do
      send(Proc1, {self(), num})
    else
      send(Proc2, {self(), num})
    end

    receive do
      num -> IO.puts(num)
    after
      500 ->
        print()
    end

    print()
  end
end

defmodule Proc1 do
  def proc1() do
    receive do
      {pid, num} ->
        send(pid, 100 / num)
        proc1()

      _e ->
        IO.puts("Error")
    end
  end
end

defmodule Proc2 do
  def proc2() do
    receive do
      {pid, num} ->
        send(pid, 1000 / num)
        proc2()

      _e ->
        IO.puts("Error")
    end
  end
end
I am trying to run all processes under the supervision of a single Supervisor, but there is a problem: only the first child is started; the other children are not. In the example above, the Print process will start, but Proc1 and Proc2 will not. How do I run all processes under one Supervisor? Important note: the Print process must get the addresses of the Proc1 and Proc2 processes for communication.
There are many issues with the code you’ve posted.
Registered processes
To be able to use a process name as the Process.dest() in a call to Kernel.send/2, one must first start the process registered under that name.
Supervisor.start_link/2
Supervisor.start_link/2 expects a list of child specs: tuples of modules and functions that return immediately, having started the process as a side effect. These functions are simply called, and there is no magic: if one of them is an infinitely recursive function, the execution flow deadlocks inside it, waiting for a message in receive/1, and the supervisor never gets to start the next child. That is why only your first child appears to start.
Supervisor performs some magic by automatically monitoring and restarting children for you, but it does nothing to spawn the separate processes. GenServer encapsulates this functionality and provides a handy way to not bother about spawning processes.
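For instance, here is a minimal sketch of Proc1 rewritten as a GenServer (the division logic is taken from your code; everything else is standard GenServer boilerplate):

defmodule Proc1 do
  use GenServer

  # start_link/1 returns immediately, with the process started as a side
  # effect, which is exactly what Supervisor expects from a child spec.
  def start_link(_opts) do
    GenServer.start_link(__MODULE__, nil, name: __MODULE__)
  end

  @impl GenServer
  def init(state), do: {:ok, state}

  # Raw send/2 messages arrive in handle_info/2.
  @impl GenServer
  def handle_info({pid, num}, state) do
    send(pid, 100 / num)
    {:noreply, state}
  end
end

With use GenServer the module gets a child_spec/1 for free, so the child list shrinks to children = [Proc1, Proc2], and name: __MODULE__ registers the process so that send(Proc1, {self(), num}) works.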
Solution
What you might do is spawn all three processes, monitor them manually, and react to the {:DOWN, ref, :process, pid, reason} message by respawning the dead process. This is effectively what Supervisor does under the hood for its children.
Launch
defmodule Launch do
  def start() do
    proc1 = spawn(&Proc1.proc1/0)
    proc2 = spawn(&Proc2.proc2/0)
    print = spawn(fn -> Print.print(proc1, proc2) end)

    Process.monitor(proc1)
    Process.monitor(proc2)
    Process.monitor(print)

    receive do
      msg -> IO.inspect(msg)
    end
  end
end
Print
defmodule Print do
  def print(pid1, pid2) do
    num =
      IO.gets("Input number: ")
      |> String.trim()
      |> String.to_integer()

    if num >= 0 do
      send(pid1, {self(), num})
    else
      send(pid2, {self(), num})
    end

    receive do
      num -> IO.puts(num)
    end

    print(pid1, pid2)
  end
end
The other two modules are fine.
Here is how it looks in iex:
iex|1 ▶ c "/tmp/test.ex"
#⇒ [Launch, Print, Proc1, Proc2]
iex|2 ▶ Launch.start
Input number: 10
10.0
Input number: 1000
0.1
Input number: a
#⇒ {:DOWN, #Reference<0.3632020665.3980394506.95298>,
# :process, #PID<0.137.0>,
# {:badarg,
# [
# {:erlang, :binary_to_integer, ["a"], []},
# {Print, :print, 2, [file: '/tmp/test.ex', line: 22]}
# ]}}
Now, instead of printing this out, respawn the failed process and you will get a bare implementation of supervised intercommunicating processes. For a :one_for_all strategy, that could be achieved with:
receive do
  {:DOWN, _, _, _, _} ->
    Process.exit(print, :normal)
    Process.exit(proc1, :normal)
    Process.exit(proc2, :normal)

    start()
end
I want to test that messages broadcast when background jobs complete actually appear in the view.
I have unit tests for this which work fine. I would actually like to ensure that the JS gets run so that the view is updated with the correct message.
So far I have not been able to find any way to do this.
Here is the test I have where I would like to add the expectation for the broadcast message:
require 'rails_helper'
require 'sidekiq/testing'

RSpec.describe 'sending a quote request', js: true do
  let(:quote_request_form) { build(:quote_request_form) }

  before do
    create(:job_rate, :proofreading)
    create(:proofreader_with_work_events)
  end

  it 'shows the user their quotation' do
    visit new_quote_request_path
    fill_in("quote_request_form_name", with: quote_request_form.name)
    fill_in("quote_request_form_email", with: quote_request_form.email)
    attach_file('customFile', '/Users/mitchellgould/RailsProjects/ProvenWordNew/spec/test_documents/quote_request_form/1.docx', make_visible: true)
    click_on "Submit"

    Sidekiq::Testing.inline! do
      page.execute_script("$('#invisible-recaptcha-form').submit()")
      expect(current_path).to eq(quote_confirm_path)

      # add expectation here:
      expect(page).to have_content("Calculating Time Required")

      page.execute_script("window.location.pathname = '#{quotation_path(Quotation.first)}'")
      expect(current_path).to eq(quotation_path(Quotation.first))
      expect(page).to have_content("Here is your quotation")
    end
  end
end
Here is my .coffee file:
$(document).on 'turbolinks:load', ->
  if $("meta[name='current_user']").length > 0
    App.notification = App.cable.subscriptions.create "NotificationChannel",
      connected: ->
        # Called when the subscription is ready for use on the server

      disconnected: ->
        # Called when the subscription has been terminated by the server

      received: (data) ->
        $('.background_message').html(data.content)
        if data.head == 302 && data.path
          window.location.pathname = data.path
  else if App.notification
    App.quotation.unsubscribe()
    delete App.notification
Here is one of the background jobs that broadcasts a message when it's done:
class CreateParagraphDetailsJob < ApplicationJob
  queue_as :default

  after_perform :broadcast_message, :calculate_proofreading_job_duration

  def perform(document, proofreading_job_id, current_user_id)
    document.create_paragraph_details
  end

  private

  def calculate_proofreading_job_duration
    CalculateDurationJob.set(wait: 1.seconds).perform_later proofreading_job_id, current_user_id
  end

  def broadcast_message
    ActionCable.server.broadcast "notification_channel_user_#{current_user_id}", content: "Analyzed writing quality of paragraphs"
  end

  def document
    self.arguments.first
  end

  def proofreading_job_id
    self.arguments.second
  end

  def current_user_id
    self.arguments.last
  end
end
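For reference, my existing unit tests assert the broadcast itself rather than the rendered page, roughly like this, using the have_broadcasted_to matcher from rspec-rails' ActionCable test helpers (document, proofreading_job, and user here stand in for whatever records the job is given):

it 'broadcasts the progress message' do
  expect {
    CreateParagraphDetailsJob.perform_now(document, proofreading_job.id, user.id)
  }.to have_broadcasted_to("notification_channel_user_#{user.id}")
    .with(content: "Analyzed writing quality of paragraphs")
end

What I cannot find is the equivalent for a js: true feature spec, where the message should actually appear in the page.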
Any ideas on how to do this?
I am trying to speed up a Ruby algorithm. I have a Rails app that uses Active Record and Nokogiri to visit a list of URLs in a database, scrape the main image from each page, and save it under the image attribute associated with that URL.
This Rails task usually takes about 2 minutes 30 seconds to complete, and I am trying to speed it up as a learning exercise. Would it be possible to use C through RubyInline and raw SQL code to achieve the desired result? My only issue is that if I use C I lose the database connection that Active Record with Ruby had, and I have no idea how to write SQL queries in conjunction with the C code that will properly connect to my DB.
Has anyone had experience with this, or does anyone even know if it's possible? I'm doing this primarily as a learning exercise and was wondering whether it was even possible. Here is the code that I want to translate into C and SQL, if you are interested:
task :getimg => :environment do
  stories = FeedEntry.all
  stories.each do |story|
    if story.image.nil?
      url = story.url
      doc = Nokogiri::HTML(open(url))
      if doc.at_css(".full-width img")
        img = doc.at_css(".full-width img")[:src]
        story.image = img
        story.save!
      elsif doc.at_css(".body-width img")
        img = doc.at_css(".body-width img")[:src]
        story.image = img
        story.save!
      elsif doc.at_css(".body-narrow-width img")
        img = doc.at_css(".body-narrow-width img")[:src]
        story.image = img
        story.save!
      elsif doc.at_css(".caption img")
        img = doc.at_css(".caption img")[:src]
        story.image = img
        story.save!
      elsif doc.at_css(".cnnArticleGalleryPhotoContainer img")
        img = doc.at_css(".cnnArticleGalleryPhotoContainer img")[:src]
        story.image = img
        story.save!
      elsif doc.at_css(".cnn_strylftcntnt div img")
        img = doc.at_css(".cnn_strylftcntnt div img")[:src]
        story.image = img
        story.save!
      elsif doc.at_css(".cnn_stryimg640captioned img")
        img = doc.at_css(".cnn_stryimg640captioned img")[:src]
        story.image = img
        story.save!
      end
    else
      # do nothing
    end
  end
end
I would appreciate any and all help and insights in this matter. Thank you in advance!!
Speed of DB Saving
I've written a web crawler in Ruby, and I found that one of the bottlenecks that can affect performance is the actual creation of the rows in the database. It's faster to have a single mass insert at the end of extracting all URLs than multiple individual inserts (at least for Postgres).
So instead of calling YourModel.save! for every URL you visit, push every URL to an array that keeps track of the URLs you need to save to the database. Then, once you've finished scraping all links, do a mass insert of all the image links through a single SQL command.
to_insert = []
stories.each do |story|
  url = story.url
  doc = Nokogiri::HTML(open(url))
  img_url = doc.at_css("img")[:src]
  to_insert.push "(#{CONN.quote(img_url)})" # quote the value so the SQL is valid and escaped
end

# notice the mass insert at the end
sql = "INSERT INTO your_table (img_url) VALUES #{to_insert.join(", ")}"
# CONN is a constant declared at the top of your file (CONN = ActiveRecord::Base.connection)
# that connects to the database
CONN.execute sql
"Speed Up" Downloading
The downloading of links will also be a bottleneck. Thus, the best option would be to create a thread pool, where each thread is allocated a partition of URLs from the database to scrape. This way, you will never be stuck waiting for a single page to download before you do any real processing.
Some pseudoish ruby code:
number_of_workers = 10
(1..number_of_workers).each do |worker|
  Thread.new do
    begin
      urls_to_scrape_for_this_thread = [...list of urls to scrape...]
      until urls_to_scrape_for_this_thread.empty?
        url = urls_to_scrape_for_this_thread.shift
        scrape(url)
      end
    rescue => e
      puts "========================================"
      puts "Thread ##{worker} error"
      puts e.message
      puts e.backtrace
      puts "========================================"
      raise e
    end
  end
end
Are the URLs remote? If so, benchmark first to see the network latency. If that is the bottleneck, there is little your code or your choice of language can do about it.
How many FeedEntry records do you have in your database? I suggest using FeedEntry.find_each instead of FeedEntry.all.each, because the former loads 1000 entries into memory, processes them, then loads the next 1000, while the latter loads all entries into memory and then iterates over them, which uses more memory and increases GC pressure.
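For example (a sketch, pushing the story.image.nil? check into the query at the same time):

FeedEntry.where(image: nil).find_each(batch_size: 1000) do |story|
  doc = Nokogiri::HTML(open(story.url))
  # ... same selector logic as in the task above ...
end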
If the bottleneck is neither of the above, then maybe it is the DOM node searching that is slow. You can find the (only one?) img node, then check its parent or grandparent node as necessary, and update your entries accordingly:
image_node = doc.at_css('img')
story.update image: image_node['src'] if needed?(image_node)

def needed?(image_node)
  parent_node  = image_node.parent
  parent_class = parent_node['class']

  return true if parent_class == 'full-width'
  return true if parent_class == 'body-width'
  return true if parent_class == 'body-narrow-width'
  return true if parent_class == 'caption'
  return true if parent_class == 'cnnArticleGalleryPhotoContainer'
  return true if parent_class == 'cnn_stryimg640captioned'

  # .cnn_strylftcntnt div img: the parent is a plain div, so look one level up
  return false unless parent_node.name == 'div'
  return true if parent_node.parent['class'] == 'cnn_strylftcntnt'

  false
end
I would like to set some images without uploading. (They already exist, or another task saves them...)
If I try (in rails console):
user = User.last
user.picture = '/some/picture.jpg'
user.save
user.picture # nil
The only way to do that is to set remote_picture_url, and then delete the upload (which is stupid)
Is there any method in CarrierWave that lets you modify only the filename?
class User < ActiveRecord::Base
  attr_accessible :picture

  # Don't want to write to the database, but want to be able to check
  attr_writer :skip

  # set a default value
  def skip
    @skip ||= false
  end

  mount_uploader :picture, PictureUploader

  # Make sure that the skip callback comes after the mount_uploader
  skip_callback :save, :before, :store_picture!, if: :skip_saving?
  # Other callbacks which might be triggered depending on the usecase
  # skip_callback :save, :before, :write_picture_identifier, if: :skip_saving?

  def skip_saving?
    skip
  end
end

class PictureUploader < CarrierWave::Uploader::Base
  # You could also implement filename=
  def set_filename(name)
    @filename = name
  end
end
Assuming you have the setup above, in your console:
user = User.last
user.picture.set_filename('/some/picture.jpg')
user.skip = true
# Save will now skip the callback store_picture!
user.save
user.picture # /some/picture.jpg
It should be noted that if you're in the console and you update an existing record that has an attached file (i.e. user.picture.file), it will show the old url/location. If you quit the console (assuming you're not in sandbox mode), come back, and query the same object, it will have the updated url/location.
For every request, I get this in the logs:
Completed 200 OK in 854ms (Views: 1.0ms | ActiveRecord: 17.0ms)
Is it possible to get it to also include the number of queries?
Something like:
Completed 200 OK in 854ms (Views: 1.0ms | ActiveRecord: 17.0ms | Queries: 10)
Ideally, I'd like all the "cached" ones to show up in that count too. I.e., even if the cache is saving my "N+1" queries from hitting the DB, I still want to know I have a problem.
I'm fine with monkeypatching / manually editing something, since I really want this just for my dev box.
(If this can be done in a civilized way so I can have it in production, that's even better, but if not, I'm fine with just having a manually modified Rails on my own machine.)
Thanks!
Daniel
I know the ThinkingSphinx gem does something quite like this, adding the time spent running Sphinx queries to the summary in the log. You can probably do something similar (maybe by making your own gem, since I bet other people would appreciate this functionality) to make the number of queries appear.
I haven't really looked hard at how it works, but it looks like modifications to ActionController and LogSubscriber are responsible:
lib/thinking_sphinx/action_controller.rb:
module ThinkingSphinx
  module ActionController
    extend ActiveSupport::Concern

    protected

    attr_internal :query_runtime

    def cleanup_view_runtime
      log_subscriber = ThinkingSphinx::ActiveRecord::LogSubscriber
      query_runtime_pre_render = log_subscriber.reset_runtime

      runtime = super

      query_runtime_post_render = log_subscriber.reset_runtime
      self.query_runtime = query_runtime_pre_render + query_runtime_post_render

      runtime - query_runtime_post_render
    end

    def append_info_to_payload(payload)
      super
      payload[:query_runtime] = query_runtime
    end

    module ClassMethods
      def log_process_action(payload)
        messages, query_runtime = super, payload[:query_runtime]
        messages << ("Sphinx: %.1fms" % query_runtime.to_f) if query_runtime
        messages
      end
    end
  end
end
lib/thinking_sphinx/active_record/log_subscriber.rb:
require 'active_support/log_subscriber'

module ThinkingSphinx
  module ActiveRecord
    class LogSubscriber < ActiveSupport::LogSubscriber
      def self.runtime=(value)
        Thread.current['thinking_sphinx_query_runtime'] = value
      end

      def self.runtime
        Thread.current['thinking_sphinx_query_runtime'] ||= 0
      end

      def self.reset_runtime
        rt, self.runtime = runtime, 0
        rt
      end

      def initialize
        super
        @odd_or_even = false
      end

      def query(event)
        self.class.runtime += event.duration
        return unless logger.debug?

        identifier = color('Sphinx Query (%.1fms)' % event.duration, GREEN, true)
        query = event.payload[:query]
        query = color query, nil, true if odd?

        debug "  #{identifier}  #{query}"
      end

      def message(event)
        return unless logger.debug?

        identifier = color 'Sphinx', GREEN, true
        message = event.payload[:message]
        message = color message, nil, true if odd?

        debug "  #{identifier}  #{message}"
      end

      def odd?
        @odd_or_even = !@odd_or_even
      end

      def logger
        return @logger if defined? @logger
        self.logger = ::ActiveRecord::Base.logger
      end

      def logger=(logger)
        @logger = logger
      end

      attach_to :thinking_sphinx
    end
  end
end
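Following the same pattern, here is a rough sketch of a query counter (untested; all the names are mine). It listens to the sql.active_record notification, which also fires for cached queries, so N+1s hidden by the query cache still show up in the count:

# config/initializers/query_counter.rb
module QueryCounter
  def self.count
    Thread.current[:query_counter_count] ||= 0
  end

  def self.count=(value)
    Thread.current[:query_counter_count] = value
  end

  def self.reset
    count.tap { self.count = 0 }
  end
end

# Fires once per SQL statement, including CACHE hits.
ActiveSupport::Notifications.subscribe('sql.active_record') do |*_args|
  QueryCounter.count += 1
end

module QueryCountLogging
  extend ActiveSupport::Concern

  def append_info_to_payload(payload)
    super
    payload[:query_count] = QueryCounter.reset
  end

  module ClassMethods
    def log_process_action(payload)
      messages = super
      messages << "Queries: #{payload[:query_count]}" if payload[:query_count]
      messages
    end
  end
end

ActionController::Base.send :include, QueryCountLogging

That should yield a line like Completed 200 OK in 854ms (Views: 1.0ms | ActiveRecord: 17.0ms | Queries: 10).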
I hope this helps.