HTML TABLE data overflow from one page to another - html-table

I'm building a dynamic Django template that fills a table with data, and I'm currently running into an issue where the data overflows from one page to the next, which distorts the layout of the header and footer elements.
`
from django.template.loader import render_to_string
from weasyprint import HTML


def create_render(data_header, data_body, data_footer=None):
    """
    Function that renders the header and body documents.
    It needs a dictionary for the header, one for the body, and the base_url.
    """
    def get_page_body(boxes):
        for box in boxes:
            if box.element_tag == "body":
                return box
        return get_page_body(box.all_children())

    # request is assumed to be available from the enclosing view
    URL_BASE = request.build_absolute_uri()
    html_string = render_to_string(data_header["template"], data_header)
    html = HTML(string=html_string, base_url=URL_BASE)
    header = html.render()

    footer = None
    if data_footer is not None:
        html_string = render_to_string(data_footer["template"], data_footer)
        html = HTML(string=html_string, base_url=URL_BASE)
        footer = html.render()

    # Main template
    html_string = render_to_string(data_body["template"], data_body)
    html = HTML(string=html_string, base_url=URL_BASE)
    main_doc = html.render()

    exists_links = False
    header_page = header.pages[0]
    exists_links = exists_links or header_page.links
    header_body = get_page_body(header_page._page_box.all_children())
    header_body = header_body.copy_with_children(header_body.all_children())

    exists_links_footer = False
    footer_body = None
    footer_page = None
    if footer is not None:
        footer_page = footer.pages[0]
        exists_links_footer = exists_links_footer or footer_page.links
        footer_body = get_page_body(footer_page._page_box.all_children())
        footer_body = footer_body.copy_with_children(footer_body.all_children())

    # Insert header and footer in the main document
    for i, page in enumerate(main_doc.pages):
        page_body = get_page_body(page._page_box.all_children())
        page_body.children += header_body.all_children()
        if exists_links:
            page.links.extend(header_page.links)
        if footer_body is not None:
            page_body.children += footer_body.all_children()
            if exists_links_footer:
                page.links.extend(footer_page.links)
    return main_doc
`
I've tried changing CSS properties such as word-break, word-wrap, and height, and I've also tried transforming the page. I've included the WeasyPrint function above in case it helps. I'm joining the elements and rendering everything into a single HTML document.
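One thing that often interacts with this kind of manual header/footer injection is the page margin of the body template: if the @page margins don't reserve room for the injected boxes, the table rows that overflow onto later pages can end up underneath them. Below is a minimal, untested sketch (not the original templates; the margin values and markup are purely illustrative) of CSS that the body template might declare so the table can break across pages cleanly:
`
from weasyprint import HTML

body_html = """
<html>
  <head>
    <style>
      @page {
        size: A4;
        /* leave room at the top and bottom for the injected header/footer boxes */
        margin: 90px 20px 60px 20px;
      }
      /* try to keep a single row from being split across two pages */
      tr { page-break-inside: avoid; }
      /* make sure the column headings repeat on every page the table spans */
      thead { display: table-header-group; }
    </style>
  </head>
  <body>
    <table>...</table>
  </body>
</html>
"""

HTML(string=body_html).render()
`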

Related

Why does my web scraping function not export the data?

I am currently web scraping a few pages stored in a collection of URLs. Here is the code I have:
import logging
import re
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

pages = {
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-germany/c-150410100",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-small-bottles/c-150410110",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-lager/c-150302375",  # More than one page
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-stout/c-150302380",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-ale/c-150302385",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-lager/c-150302386",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-stout/c-150302387",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-ale/c-150302388",  # More than one page
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-cider/c-150302389",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-cider/c-150302390",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-alcopops/c-150302395",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-vodka/c-150302430",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-irish-whiskey/c-150302435",  # More than one page
}

products = []
prices = []
images = []
urls = []

def export_data():
    logging.info("exporting data to pandas dataframe")
    supervalu = pd.DataFrame({
        'img_url': images,
        'url': urls,
        'product': products,
        'price': prices
    })
    logging.info("sorting data by price")
    supervalu.sort_values(by=['price'], inplace=True)
    output_json = 'supervalu.json'
    output_csv = 'supervalu.csv'
    output_dir = Path('../../json/supervalu')
    output_dir.mkdir(parents=True, exist_ok=True)
    logging.info("exporting data to json")
    supervalu.to_json(output_dir / output_json)
    logging.info("exporting data to csv")
    supervalu.to_csv(output_dir / output_csv)

def get_data(div):
    raw_data = div.find_all('div', class_='ga-product')
    raw_images = div.find_all('img')
    raw_url = div.find_all('a', class_="ga-product-link")
    product_data = [data['data-product'] for data in raw_data]
    new_data = [d.replace("\r\n", "") for d in product_data]
    for name in new_data:
        new_names = re.search(' "name": "(.+?)"', name).group(1)
        products.append(new_names)
    for price in new_data:
        new_prices = re.search(' "price": "(.+?)"', price).group(1)
        prices.append(new_prices)
    for image in raw_images:
        new_images = image['data-src']
        images.append(new_images)
    for url in raw_url:
        new_url = url['href']
        urls.append(new_url)

def scrape_page(next_url):
    page = requests.get(next_url)
    if page.status_code != 200:
        logging.error("Page does not exist!")
        exit()
    soup = BeautifulSoup(page.content, 'html.parser')
    get_data(soup.find(class_="row product-list ga-impression-group"))
    try:
        load_more_text = soup.find('a', class_='pill ajax-link load-more').findAll('span')[-1].text
        if load_more_text == 'Load more':
            next_page = soup.find('a', class_="pill ajax-link load-more").get('href')
            logging.info("Scraping next page: {}".format(next_page))
            scrape_page(next_page)
        else:
            export_data()
    except:
        logging.warning("No more next pages to scrape")
        pass

for page in pages:
    logging.info("Scraping page: {}".format(page))
    scrape_page(page)
The main issue appears during the try/except handling of the next page. Since not all of the pages provided have the appropriate 'Load more' snippet, an AttributeError is raised, which is why I have wrapped that statement in a try/except block. I want to scrape the pages that don't have a next page anyway, skip past the missing link, and keep looping through the rest of the pages until a next page turns up. All of the pages appear to be looped through, but I never get the data exported. If I try the following code:
try:
    load_more_text = soup.find('a', class_='pill ajax-link load-more').findAll('span')[-1].text
    if load_more_text == 'Load more':
        next_page = soup.find('a', class_="pill ajax-link load-more").get('href')
        logging.info("Scraping next page: {}".format(next_page))
        scrape_page(next_page)
except:
    logging.warning("No more next pages to scrape")
    pass
else:
    export_data()
This is the closest I have gotten to the desired outcome. The above code works and the data gets exported, but not all of the pages make it into the export, because a new dataframe is created every time a 'next page' chain ends: the code iterates through the list, finds a next page, scrapes that chain of pages, and then creates a new dataframe that overwrites the previously exported data.
I'm hoping someone can give me some guidance on what to do, as I have been stuck on this part of my personal project and I'm not sure how to overcome this obstacle. Thank you in advance.
I have modified my code as shown below and I have received my desired outcome.
load_more_text = soup.find('a', class_='pill ajax-link load-more')
if load_more_text:
    next_page = soup.find('a', class_="pill ajax-link load-more").get('href')
    logging.info("Scraping next page: {}".format(next_page))
    scrape_page(next_page)
else:
    export_data()
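An alternative structure (a sketch of the accumulate-then-export idea, not the code the author ended up with) is to drop export_data() from scrape_page() entirely and export once after the outer loop, since the module-level lists keep growing across all pages:

# Sketch: scrape every start page (and any "Load more" continuation pages),
# then export the accumulated lists exactly once at the end.
def scrape_page(next_url):
    page = requests.get(next_url)
    if page.status_code != 200:
        logging.error("Page does not exist!")
        return  # skip this start page instead of exiting the whole script
    soup = BeautifulSoup(page.content, 'html.parser')
    get_data(soup.find(class_="row product-list ga-impression-group"))
    load_more = soup.find('a', class_='pill ajax-link load-more')
    if load_more:
        next_page = load_more.get('href')
        logging.info("Scraping next page: {}".format(next_page))
        scrape_page(next_page)

for page in pages:
    logging.info("Scraping page: {}".format(page))
    scrape_page(page)

export_data()  # one export, containing the data from every page scraped above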

How to add a second dataset to a plotly annotated heatmap?

I'm trying to create an annotated heatmap with a dropdown menu to switch between two different sets of data. The datasets have the same format and I have added a working dropdown menu. But I can only add one dataset at a time. I am using
fig = ff.create_annotated_heatmap(data, annotation_text=numbers, showscale=True, colorscale=colorscale, text=hover, hoverinfo='text')
to create the annotated heatmap. Is there a way to add a second dataset to switch between with the dropdown menu?
Resolved. I had to add the second data set to the args of the dropdown menu object,
along with any other changes needed (such as hover text).
I just realized how easy it is to switch between two plots with a menu. You can just take the data from each figure to build a list of traces to switch between:
from plotly.offline import init_notebook_mode, iplot
import plotly.figure_factory as ff

init_notebook_mode(connected=True)

fig_1 = ff.create_annotated_heatmap(...)
fig_2 = ff.create_annotated_heatmap(...)

menu_items = ["Heatmap 1", "Heatmap 2"]
trace1 = fig_1.to_dict()["data"][0]
trace2 = fig_2.to_dict()["data"][0]

buttons = []
for i, menu_item in enumerate(menu_items):
    visibility = [i == j for j in range(len(menu_items))]
    button = dict(
        label=menu_item,
        method='update',
        args=[{'visible': visibility},
              {'title': menu_item}])
    buttons.append(button)

updatemenus = list([
    dict(buttons=buttons)
])

layout = dict(updatemenus=updatemenus, title=menu_items[0])
fig = dict(data=[trace1, trace2], layout=layout)
iplot(fig)
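One caveat worth noting (an untested addition of mine, not part of the answer above): with ff.create_annotated_heatmap the cell numbers live in the figure's layout annotations, so toggling only trace visibility can leave the first heatmap's numbers on screen. The 'update' method also accepts layout changes, so each button can swap in the matching annotations along with the title:

# Sketch: carry each figure's annotations along with the visibility toggle.
annotations_1 = fig_1.to_dict()["layout"]["annotations"]
annotations_2 = fig_2.to_dict()["layout"]["annotations"]

buttons = [
    dict(label="Heatmap 1", method="update",
         args=[{"visible": [True, False]},
               {"title": "Heatmap 1", "annotations": annotations_1}]),
    dict(label="Heatmap 2", method="update",
         args=[{"visible": [False, True]},
               {"title": "Heatmap 2", "annotations": annotations_2}]),
]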

Send Discord Embed of Product Stock Parsed from HTML with BS4 and Requests

So I have the code:
import requests
import discord
from bs4 import BeautifulSoup

class MyClient(discord.Client):
    async def on_ready(self):
        print('Logged on as {0}!'.format(self.user))

    async def on_message(self, message):
        if (message.channel.id == 678447420643868674):
            if "test" in message.content:
                r = requests.get('https://www.jimmyjazz.com/mens/footwear/adidas-solar-hu-nmd/BB9528')
                soup = BeautifulSoup(r.text, 'html.parser')
                embed = discord.Embed(color=0x00ff00)
                embed.title = "test"
                for anchor_tag in soup.find_all(class_="box_wrapper")[0].findChildren():
                    if "piunavailable" in anchor_tag['class']:
                        embed.description = f"Size {anchor_tag.text} OOS"
                        await message.channel.send(embed=embed)
                    else:
                        embed.description = f"Size {anchor_tag.text} in stock!"
                        await message.channel.send(embed=embed)

client = MyClient()
client.run('NjY2MDMyMDc0NjM3MTgwOTQ4.XkjBLg.I3dtsL2nkVh_bafTlycSwBApQfU')
And that sends the item stock as an embed for each size:
https://gyazo.com/7a7c868d00a99fc3798a3c24feb9ea7e
How would I change the code to make it send for every size in one embed instead of an embed per size?
Thanks :)
Embeds in Discord can have fields, which you add with the embed.add_field() function: embed.add_field(name="Field1", value="hi", inline=False)
Embeds have a few size limits (copied from https://discordjs.guide/popular-topics/embeds.html#notes):
A field's name is limited to 256 characters and its value to 1024 characters
There can be up to 25 fields
In addition, the sum of all characters in an embed structure must not exceed 6000 characters
Because of this, you will likely have to split your product stock across multiple embeds once it exceeds 25 fields or 6000 characters, by keeping a counter for both and, when either limit is reached, resetting them and sending the message.
Here is a partial example (I've not tested it, but the logic should be correct):
r = requests.get('https://www.jimmyjazz.com/mens/footwear/adidas-solar-hu-nmd/BB9528')
soup = BeautifulSoup(r.text, 'html.parser')

charCount = 0
fieldCount = 0
embed = discord.Embed(color=0x00ff00)
embed.title = "test"

for anchor_tag in soup.find_all(class_="box_wrapper")[0].findChildren():
    anchor_text = anchor_tag.text
    charCount += len(anchor_text)
    if charCount >= 6000 or fieldCount >= 25:
        charCount = len(anchor_text)
        fieldCount = 0
        await message.channel.send(embed=embed)
        embed = discord.Embed(color=0x00ff00)
        embed.title = "test"
    if "piunavailable" in anchor_tag['class']:
        embed.add_field(name=f"Size {anchor_text}", value="Out Of Stock")
    else:
        embed.add_field(name=f"Size {anchor_text}", value="In stock!")
    fieldCount += 1
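One gap in that partial example: the last embed, holding whatever fields accumulated after the final reset, is never sent. A small hedged addition, assumed to run right after the loop inside the same on_message handler:

# Flush the final, partially filled embed once the loop is done.
if fieldCount > 0:
    await message.channel.send(embed=embed)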
Append as such:
for anchor_tag in soup.find_all(class_="box_wrapper")[0].findChildren():
    if "piunavailable" in anchor_tag['class']:
        embed.add_field(name=f"Size {anchor_tag.text}", value=":x:", inline=True)
    else:
        embed.add_field(name=f"Size {anchor_tag.text}", value=":white_check_mark: | ATC", inline=True)
await message.channel.send(embed=embed)

How to extract text from different elements into one row

I'm setting up a new Scrapy spider that I have developed.
I am using Windows 10 and it runs.
My problem is extracting text from different elements. These elements sometimes sit in a strong tag or a p tag, sometimes have a class, and sometimes have an id, but I need to pull them all into a single row of text.
Please check out these links to the site:
https://exhibits.otcnet.org/otc2019/Public/eBooth.aspx?IndexInList=404&FromPage=Exhibitors.aspx&ParentBoothID=&ListByBooth=true&BoothID=193193&fromFeatured=1
https://exhibits.otcnet.org/otc2019/Public/eBooth.aspx?IndexInList=0&FromPage=Exhibitors.aspx&ParentBoothID=&ListByBooth=true&BoothID=202434
https://exhibits.otcnet.org/otc2019/Public/eBooth.aspx?IndexInList=1218&FromPage=Exhibitors.aspx&ParentBoothID=&ListByBooth=true&BoothID=193194&fromFeatured=1
https://prnt.sc/nkl1vc,
https://prnt.sc/nkl1zy,
https://prnt.sc/nkl247,
# -*- coding: utf-8 -*-
import scrapy

class OtcnetSpider(scrapy.Spider):
    name = 'otcnet'
    # allowed_domains = ['otcnet.org']
    start_urls = ['https://exhibits.otcnet.org/otc2019/Public/Exhibitors.aspx?Index=All&ID=26006&sortMenu=107000']

    def parse(self, response):
        links = response.css('a.exhibitorName::attr(href)').extract()
        for link in links:
            ab_link = response.urljoin(link)
            yield scrapy.Request(ab_link, callback=self.parse_p)

    def parse_p(self, response):
        url = response.url
        Company = response.xpath('//h1/text()').extract_first()
        if Company:
            Company = Company.strip()
        Country = response.xpath('//*[@class="BoothContactCountry"]/text()').extract_first()
        State = response.xpath('//*[@class="BoothContactState"]/text()').extract_first()
        if State:
            State = State.strip()
        Address1 = response.xpath('//*[@class="BoothContactAdd1"]/text()').extract_first()
        City = response.xpath('//*[@class="BoothContactCity"]/text()').extract_first()
        if City:
            City = City.strip()
        zip_c = response.xpath('//*[@class="BoothContactZip"]/text()').extract_first()
        Address = str(Address1) + ' ' + str(City) + ' ' + str(State) + ' ' + str(zip_c)
        Website = response.xpath('//*[@id="BoothContactUrl"]/text()').extract_first()
        Booth = response.css('.eBoothControls li:nth-of-type(1)::text').extract_first().replace('Booth: ', '')
        Description = ''
        Products = response.css('.caption b::text').extract()
        Products = ', '.join(Products)
        vid_bulien = response.css('.aa-videos span.hidden-md::text').extract_first()
        if vid_bulien == "Videos":
            vid_bulien = "Yes"
        else:
            vid_bulien = "No"
        Video_present = vid_bulien
        Conference_link = url
        Categories = response.css('.ProductCategoryLi a::text').extract()
        Categories = ', '.join(Categories)
        Address = Address.replace('None', '')
        yield {
            'Company': Company,
            'Country': Country,
            'State': State,
            'Address': Address,
            'Website': Website,
            'Booth': Booth,
            'Description': Description,
            'Products': Products,
            'Video_present': Video_present,
            'Conference_link': Conference_link,
            'Categories': Categories
        }
I expect the output to be a single row of description text assembled from the different elements.
According to this post and the excellent @dimitre-novatchev answer, you need to find a node-set intersection.
$ns1 for your page is:
//p[@class="BoothProfile"]/following-sibling::p
$ns2 is:
//p[@class="BoothProfile"]/following-sibling::div[1]/preceding-sibling::p
As a result, you need to process these p elements:
//p[@class="BoothProfile"]/following-sibling::p[count(.|//p[@class="BoothProfile"]/following-sibling::div[1]/preceding-sibling::p) = count(//p[@class="BoothProfile"]/following-sibling::div[1]/preceding-sibling::p)]
You can use this Scrapy code:
for p_elem in response.xpath('//p[@class="BoothProfile"]/following-sibling::p[count(.|//p[@class="BoothProfile"]/following-sibling::div[1]/preceding-sibling::p) = count(//p[@class="BoothProfile"]/following-sibling::div[1]/preceding-sibling::p)]'):
    # using string() to stringify <p>
    Description += p_elem.xpath('string(.)').extract_first()
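For readability, here is a sketch (my own arrangement, not part of the answer) of how that loop could slot into parse_p(), with the long XPath split up and Description built by joining the stringified paragraphs:

# Kayessian node-set intersection: the <p> elements after the BoothProfile
# paragraph but before the first following <div>. Split purely for readability.
ns1 = '//p[@class="BoothProfile"]/following-sibling::p'
ns2 = '//p[@class="BoothProfile"]/following-sibling::div[1]/preceding-sibling::p'
profile_paragraphs = response.xpath(
    '{ns1}[count(.|{ns2}) = count({ns2})]'.format(ns1=ns1, ns2=ns2)
)
# string(.) flattens each <p>, including any nested <strong> text.
Description = ' '.join(
    (p.xpath('string(.)').extract_first() or '').strip()
    for p in profile_paragraphs
)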

Difficulty setting the "header from top" and "footer from bottom" values using Apache POI

I cannot find any direct Apache POI code for setting the "header from top" and "footer from bottom" values in a Word file. Please give me some suggestions for setting these values using Java code.
I too have been looking for the answer to this question for quite some time and came across this answer, but sadly I forgot where.
I hope this helps someone too.
import java.io.*;
import org.apache.poi.xwpf.usermodel.*;
import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPageMar;
import java.math.BigInteger;

public class CreateWordHeaderFooterSpacing {

    public static void main(String[] args) throws Exception {

        XWPFDocument document = new XWPFDocument();

        // create header-footer policy
        XWPFHeaderFooterPolicy headerFooterPolicy = document.getHeaderFooterPolicy();
        if (headerFooterPolicy == null) headerFooterPolicy = document.createHeaderFooterPolicy();

        // create footer
        XWPFFooter footer = headerFooterPolicy.createFooter(XWPFHeaderFooterPolicy.DEFAULT);
        XWPFParagraph paragraph = footer.getParagraphArray(0);
        paragraph.setAlignment(ParagraphAlignment.CENTER);
        XWPFRun run = paragraph.createRun();
        run.setText("Footer");

        CTSectPr sectPr = document.getDocument().getBody().getSectPr();
        if (sectPr == null) sectPr = document.getDocument().getBody().addNewSectPr();
        CTPageMar pageMar = sectPr.getPgMar();
        if (pageMar == null) pageMar = sectPr.addNewPgMar();
        pageMar.setLeft(BigInteger.valueOf(720));   // 720 twentieths of a point (twips) = 720/20 = 36 pt = 36/72 = 0.5"
        pageMar.setRight(BigInteger.valueOf(720));
        pageMar.setTop(BigInteger.valueOf(1440));   // 1440 twips = 1440/20 = 72 pt = 72/72 = 1"
        pageMar.setFooter(BigInteger.valueOf(720)); // 0.5" footer margin ("footer from bottom")
        long notPrintableBottomPageRange = (long)(0.038888 * 72 * 20); // 0.038888" gap for the non-printable bottom page range
        pageMar.setBottom(BigInteger.valueOf(1152 + 720 + notPrintableBottomPageRange)); // 1152 twips = 1152/20/72 = 0.8"
        // bottom margin = 0.8" footer spacing + 0.5" footer margin + 0.038888" gap for the non-printable bottom page range

        document.write(new FileOutputStream("CreateWordHeaderFooterSpacing.docx"));
        document.close();
    }
}