django oscar invoice pdf generation - pdf

Hi, I am trying to set up invoice generation for django-oscar.
I have found a useful link https://groups.google.com/forum/#!topic/django-oscar/sg1qtyuu32Q
(two main links from this google groups are
https://gist.github.com/elbaschid/8722203 and https://gist.github.com/elbaschid/8776935)
but I am having an issue trying to set this up.
I saved my templates as suggested in the link, and below is the code under OrderListView:
def generate_packing_slips(self, request, orders):
    template = loader.get_template(self.packing_slip_template)
    main_pdf = pisaPDF()

    for order in orders:
        voucher_codes = []
        for discount in order.discounts.all():
            if discount.voucher_code:
                voucher_codes.append(discount.voucher_code)

        context = RequestContext(request, {
            'order': order,
            'STATIC_ROOT': settings.STATIC_ROOT,
            'voucher_codes': voucher_codes,
        })
        html = template.render(context)

        result = StringIO()
        order_pdf = pisa.pisaDocument(StringIO(html.encode("UTF-8")), result)

        if order_pdf.err:
            messages.error(
                self.request,
                _("A problem occurred trying to generate the packing slip for "
                  "order #%s") % order.number,
            )
        else:
            main_pdf.addDocument(order_pdf)

    response = HttpResponse(main_pdf.getvalue(), mimetype='application/pdf')
    filename = self.get_packing_slip_filename(orders)
    response['Content-Disposition'] = 'attachment; filename=%s' % filename
    return response
However, this gives me the following error:
AttributeError at /dashboard/orders/
'OrderListView' object has no attribute 'packing_slip_template'
The path for the template is 'dashboard/orders/order_packing_slip.html'
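For what it's worth, the error suggests the view class never declares that attribute. A minimal sketch of how it could be declared, assuming the generate_packing_slips method above lives on a subclass of Oscar's dashboard OrderListView and that the dashboard app has been forked so this subclass is the one Oscar loads:

# Hypothetical sketch: give the view the template path the snippet expects.
from oscar.apps.dashboard.orders.views import OrderListView as CoreOrderListView

class OrderListView(CoreOrderListView):
    packing_slip_template = 'dashboard/orders/order_packing_slip.html'

    def generate_packing_slips(self, request, orders):
        ...  # the code shown above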
I also added a button to order_list.html with the code below:
<button style="margin-right:10px" type="submit" class="btn w3-black w3-hover-dark-gray" name="action" value="generate_packing_slips" data-loading-text="{% trans 'Submitting...' %}">{% trans "Download" %}</button>
I would be grateful for any suggestions!
Many Thanks

Related

Why does my web scraping function not export the data?

I am currently scraping a few pages stored in a list. I have the following code:
import logging
import re
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

pages = {
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-germany/c-150410100",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-small-bottles/c-150410110",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-lager/c-150302375",  # More than one page
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-stout/c-150302380",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-ale/c-150302385",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-lager/c-150302386",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-stout/c-150302387",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-ale/c-150302388",  # More than one page
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-cider/c-150302389",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-cider/c-150302390",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-alcopops/c-150302395",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-vodka/c-150302430",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-irish-whiskey/c-150302435",  # More than one page
}

products = []
prices = []
images = []
urls = []

def export_data():
    logging.info("exporting data to pandas dataframe")
    supervalu = pd.DataFrame({
        'img_url': images,
        'url': urls,
        'product': products,
        'price': prices
    })
    logging.info("sorting data by price")
    supervalu.sort_values(by=['price'], inplace=True)

    output_json = 'supervalu.json'
    output_csv = 'supervalu.csv'
    output_dir = Path('../../json/supervalu')
    output_dir.mkdir(parents=True, exist_ok=True)

    logging.info("exporting data to json")
    supervalu.to_json(output_dir / output_json)
    logging.info("exporting data to csv")
    supervalu.to_csv(output_dir / output_csv)

def get_data(div):
    raw_data = div.find_all('div', class_='ga-product')
    raw_images = div.find_all('img')
    raw_url = div.find_all('a', class_="ga-product-link")
    product_data = [data['data-product'] for data in raw_data]
    new_data = [d.replace("\r\n", "") for d in product_data]
    for name in new_data:
        new_names = re.search(' "name": "(.+?)"', name).group(1)
        products.append(new_names)
    for price in new_data:
        new_prices = re.search(' "price": "(.+?)"', price).group(1)
        prices.append(new_prices)
    for image in raw_images:
        new_images = image['data-src']
        images.append(new_images)
    for url in raw_url:
        new_url = url['href']
        urls.append(new_url)

def scrape_page(next_url):
    page = requests.get(next_url)
    if page.status_code != 200:
        logging.error("Page does not exist!")
        exit()
    soup = BeautifulSoup(page.content, 'html.parser')
    get_data(soup.find(class_="row product-list ga-impression-group"))
    try:
        load_more_text = soup.find('a', class_='pill ajax-link load-more').findAll('span')[-1].text
        if load_more_text == 'Load more':
            next_page = soup.find('a', class_="pill ajax-link load-more").get('href')
            logging.info("Scraping next page: {}".format(next_page))
            scrape_page(next_page)
        else:
            export_data()
    except:
        logging.warning("No more next pages to scrape")
        pass

for page in pages:
    logging.info("Scraping page: {}".format(page))
    scrape_page(page)
The main issue appears in the try/except handling of the next page. Not all of the pages provided contain the "load more" snippet, so an AttributeError is raised, which is why I have wrapped that part in a try/except. I want to scrape the pages that don't have a next page anyway, skip the missing link, and keep looping through the rest of the pages until a next page turns up. All of the pages appear to be looped through, but the data is never exported. If I try the following code:
try:
    load_more_text = soup.find('a', class_='pill ajax-link load-more').findAll('span')[-1].text
    if load_more_text == 'Load more':
        next_page = soup.find('a', class_="pill ajax-link load-more").get('href')
        logging.info("Scraping next page: {}".format(next_page))
        scrape_page(next_page)
except:
    logging.warning("No more next pages to scrape")
    pass
else:
    export_data()
This is the closest I have gotten to the desired outcome. The code above works and the data gets exported, but not all of the pages end up in the export: a new dataframe is written every time a chain of next pages ends, i.e. the code iterates through the list, finds a next page, scrapes that chain of pages, and then creates a new dataframe that overwrites the previous output.
I'm hoping someone can give me some guidance, as I have been stuck on this part of my personal project and I'm not sure how to overcome this obstacle. Thank you in advance.
I have modified my code as shown below and now get the desired outcome.
load_more_text = soup.find('a', class_='pill ajax-link load-more')
if load_more_text:
    next_page = soup.find('a', class_="pill ajax-link load-more").get('href')
    logging.info("Scraping next page: {}".format(next_page))
    scrape_page(next_page)
else:
    export_data()
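An alternative that avoids writing a dataframe per chain of pages would be to drop export_data() from scrape_page() entirely and export once after the outer loop, so every page (with or without a "load more" link) accumulates into the same lists before anything is written. A minimal sketch using the same functions and lists defined above:

# Sketch: export once, after all start pages and their "load more" chains are scraped.
def scrape_page(next_url):
    page = requests.get(next_url)
    if page.status_code != 200:
        logging.error("Page does not exist!")
        return
    soup = BeautifulSoup(page.content, 'html.parser')
    get_data(soup.find(class_="row product-list ga-impression-group"))
    load_more = soup.find('a', class_='pill ajax-link load-more')
    if load_more and load_more.get('href'):
        logging.info("Scraping next page: {}".format(load_more.get('href')))
        scrape_page(load_more.get('href'))

for page in pages:
    logging.info("Scraping page: {}".format(page))
    scrape_page(page)

export_data()  # single export covering every scraped page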

BeautifulSoup.find_all does not return the class shown by "Inspect" under "div"

Goal: Query all the job records as a collection while scraping a job website.
Steps: There are 100 job records. Using "Inspect" in Google Chrome on a single job record, it shows up as follows:
<div class="coveo-list-layout CoveoResult">
<div class="coveo-result-frame item-wrap">
<div class="content-main">
<div class="coveo-result-cell content-wrap">
Problem: The following code does not return a count of 100; it is just 0. All of the classes shown above (from "Inspect" on a single record) were used in find_all, but none of them returns the 100 records.
>>> response = requests.get(url)
>>> print(response)
<Response [200]>
>>> response.reason
'OK'
>>> soup = BeautifulSoup(response.text, 'html.parser')
>>> cards = soup.find_all('div', 'content-list-layout CoveoResult')
>>> len(cards)
0
>>> cards = soup.find_all('div')
>>> len(cards)
86
Code tried as follows (none of them works):
cards = soup.find_all('div','content-list-layout CoveoResult')
cards = soup.find_all('div','content-list-layout')
cards = soup.find_all('div','coveo-result-frame item-wrap')
cards = soup.find_all('div','coveo-result-frame')
cards = soup.find_all('div','content-main')
cards = soup.find_all('div','coveo-result-cell content-wrap')
cards = soup.find_all('div','coveo-result-cell')
Next Steps: I need help finding the class associated with a single record. As a debugging step I have generated the output of "cards = soup.
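One thing worth checking (an assumption, since the target site isn't named): Coveo result lists are usually rendered by JavaScript after the initial page load, so the CoveoResult divs visible in "Inspect" may simply not exist in the HTML that requests downloads. A quick, hedged way to verify this against the raw response rather than the live DOM:

# Debugging sketch: is the markup from "Inspect" in the HTML requests received?
# Uses the same `url` variable as the snippet above.
import requests

response = requests.get(url)
html = response.text

print('CoveoResult' in html)            # False would suggest the results are added by JavaScript
print(html.count('coveo-list-layout'))  # how many result containers exist in the raw HTML

# If the classes are missing here, find_all() on this soup can never see them;
# the data would have to come from the site's underlying search API or a
# browser-driven tool such as Selenium instead.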

Better way to clean product description using BeautifulSoup?

I have written the following code to fetch a product description from a site using BeautifulSoup:
import logging

import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

def get_soup(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            html = response.content
            return BeautifulSoup(html, "html.parser")
    except Exception as ex:
        print("error from " + url + ": " + str(ex))

def get_product_details(url):
    try:
        soup = get_soup(url)
        prod_details = dict()
        desc_list = soup.select('p ~ ul')
        prod_details['description'] = ''.join(desc_list)
        return prod_details
    except Exception as ex:
        logger.warning('%s - %s', ex, url)

if __name__ == '__main__':
    get_product_details("http://www.aprisin.com.sg/p-748-littletikespoptunesguitar.html")
In the above code I am trying to convert the description (a list) to a string, but I get the issue below:
[WARNING] aprisin.py:82 get_product_details() : sequence item 0: expected str instance, Tag found - http://www.aprisin.com.sg/p-748-littletikespoptunesguitar.html
Output of the description without converting it to a string:
[<ul>
<li>Freestyle</li>
<li>Play along with 5 pre-set tunes: </li>
</ul>, <ul>
<li>Each string will play a note</li>
<li>Guitar has a whammy bar</li>
<li>2-in-1 volume control and power button </li>
<li>Simple and easy to use </li>
<li>Helps develop music appreciation </li>
<li>Requires 3 "AA" alkaline batteries (included)</li>
</ul>]
You are passing a list of Tag objects instead of strings to join(); join() works with a list of strings. Use one of the following changes for the join call:
prod_details['description'] = ''.join([tag.get_text() for tag in desc_list])
or
prod_details['description'] = ''.join([tag.string for tag in desc_list])
In case you want the description along with the HTML content, you can use the following:
# this will preserve the html tags and indentation.
prod_details['description'] = ''.join([tag.prettify() for tag in desc_list])
or
# this will return the html content as string.
prod_details['description'] = ''.join([str(tag) for tag in desc_list])
desc_list is a list of bs4.element.Tag. You should convert each tag to a string:
desc_list = soup.select('p ~ ul')
prod_details['description'] = str(desc_list[0])
You're trying to join a list of Tags, but the join method needs str arguments. Try:
''.join([str(i) for i in desc_list])
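For completeness, a small sketch of the question's get_product_details with the text-only fix applied, using get_text() with a separator so the list items don't run together (the separator choice is just an illustration; get_soup and logger are the ones defined in the question):

def get_product_details(url):
    try:
        soup = get_soup(url)
        prod_details = dict()
        desc_list = soup.select('p ~ ul')
        # Join the text of each matched <ul>, one bullet per line.
        prod_details['description'] = '\n'.join(
            tag.get_text(separator='\n', strip=True) for tag in desc_list
        )
        return prod_details
    except Exception as ex:
        logger.warning('%s - %s', ex, url)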

Scrapy only show the first result of each page

I need to scrape the items on the first page, then follow the next button to the second page, scrape that, and so on.
This is my code, but it only scrapes the first item on each page; if there are 20 pages it visits every page and scrapes only the first item.
Could anyone please help me?
Thank you, and apologies for my English.
class CcceSpider(CrawlSpider):
    name = 'ccce'
    item_count = 0
    allowed_domain = ['www.example.com']
    start_urls = ['https://www.example.com./afiliados value=&categoria=444&letter=']

    rules = {
        # Rules for each item
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//li[@class="pager-next"]/a')), callback='parse_item', follow=True),
    }

    def parse_item(self, response):
        ml_item = CcceItem()
        # product info
        ml_item['nombre'] = response.xpath('normalize-space(//div[@class="news-col2"]/h2/text())').extract()
        ml_item['url'] = response.xpath('normalize-space(//div[@class="website"]/a/text())').extract()
        ml_item['correo'] = response.xpath('normalize-space(//div[@class="email"]/a/text())').extract()
        ml_item['descripcion'] = response.xpath('normalize-space(//div[@class="news-col4"]/text())').extract()
        self.item_count += 1
        if self.item_count > 5:
            # insert_table(ml_item)
            raise CloseSpider('item_exceeded')
        yield ml_item
As you haven't given a working target URL, I'm guessing a bit here, but most probably this is the problem:
parse_item should be a parse_page (and act accordingly)
Scrapy is downloading a full page which has - according to your description - multiple items and then passes this as a response object to your parse method.
It's your parse method's responsibility to process the whole page by iterating over the items displayed on the page and creating multiple scraped items accordingly.
The scrapy documentation has several good examples for this, one is here: https://doc.scrapy.org/en/latest/topics/selectors.html#working-with-relative-xpaths
Basically your code structure in def parse_XYZ should look like this:
def parse_page(self, response):
    items_on_page = response.xpath('//...')
    for sel_item in items_on_page:
        ml_item = CcceItem()
        # product info
        ml_item['nombre'] = # ...
        # ...
        yield ml_item
Insert the right xpaths for getting all items on the page and adjust your item xpaths and you're ready to go.
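Putting the two together, a hedged sketch of what that could look like with the xpaths from the question. The container xpath is only a guess at what wraps each record on the page; the important part is that the field xpaths are relative (they start with ".//" and are evaluated against one record at a time):

def parse_page(self, response):
    # Assumed container xpath: it must select exactly one node per record.
    for sel_item in response.xpath('//div[contains(@class, "news-col2")]/..'):
        ml_item = CcceItem()
        # Relative xpaths (note the leading ".") so each field is read from this record only.
        ml_item['nombre'] = sel_item.xpath('normalize-space(.//div[@class="news-col2"]/h2/text())').extract()
        ml_item['url'] = sel_item.xpath('normalize-space(.//div[@class="website"]/a/text())').extract()
        ml_item['correo'] = sel_item.xpath('normalize-space(.//div[@class="email"]/a/text())').extract()
        ml_item['descripcion'] = sel_item.xpath('normalize-space(.//div[@class="news-col4"]/text())').extract()
        yield ml_item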

Scraping tabulated paginated data

A page with data I need has changed its structure to a new paginated format. I'm working on updating my scraper for the page.
I can't understand how to collect the data from all of the different pages.
The page to be scraped is: http://eserver.goutsi.com:8080/DPW230.cgi
I know how to collect the data in the tables but I can't figure out how to handle the pagination.
This is my original script:
scrape_actor = Mechanize.new
page = scrape_actor.get("http://loads.goutsi.com:8080/wntv5/BKLoad")
rows = page.body.to_s.split("</tr>")
rows.each do |row|
  if row.include? "bgcolor='#f5f5f5'"
    columns = row.split("</td>")
    i = 0
    while i < columns.count
      columns[i] = columns[i].gsub(%r{</?[^>]+?>},'').gsub(/[\n\t\r ]+/,'').gsub(" ",'')
      i += 1
    end
    username = "UTSI"
    origin = columns[0].gsub(" ","")
    pickup = Chronic.parse(columns[1]+"/"+Time.now.strftime("%Y"))
    dest = columns[3]
    comments = "miles: #{columns[4]}, phone: #{columns[9]}, other: #{columns[11]}"
    equipment = columns[6]
    ltl = false
    ltl = true if columns[7] == "LTL"
    Scrape.post_load(username,origin,dest,pickup,'',ltl,equipment,comments,'','','')
  end
end
When you click one of the page links, JavaScript on the page triggers a POST request to the same path with the new page number.
This can be found in their js file at http://eserver.goutsi.com:8080/js/LoadBoard.js
function gotoPage(pageNumber)
{
    document.getElementById("PageNbr").value = pageNumber; // Set new page number
    document.getElementById("PageDir").value = "R";        // Refresh
    document.getElementById("theForm").submit();
}
That code submits this form:
<form action="/DPW230.cgi" method="post" id="theForm">...
Which has the field:
<input type="hidden" id="PageDir" name="PageDir" value=" "><input type="hidden" id="PageNbr" name="PageNbr" value="1">
whose values are updated by that same JavaScript.
This means you need to make POST requests to this URL with consecutive page numbers as params, then parse each page in turn, aggregating the results.
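As an illustration of the request shape only, a sketch in Python with requests rather than the Ruby Mechanize used above (in Mechanize the same thing can be done by filling in the PageNbr and PageDir fields of theForm and submitting it). The stopping condition is an assumption, since the total page count isn't known up front, and the server may expect additional form fields:

# Sketch: walk the paginated load board by POSTing PageNbr/PageDir,
# collecting every page's HTML before parsing the rows together.
import requests

BASE_URL = "http://eserver.goutsi.com:8080/DPW230.cgi"

pages_html = []
page_number = 1
while True:
    response = requests.post(BASE_URL, data={"PageNbr": str(page_number), "PageDir": "R"})
    if response.status_code != 200 or not response.text.strip():
        break
    if pages_html and response.text == pages_html[-1]:
        # The server kept returning the same page, so assume we've run out of pages.
        break
    pages_html.append(response.text)
    page_number += 1

# pages_html now holds the HTML for every page; each one can be split on </tr>
# and parsed exactly like the single-page version of the scraper above.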