Scrapy urljoin inconsistent and incomplete?

I'm trying to get all the xml file links from this domain. When I use the scrapy shell, I get the relative link I am expecting.
>>> response.xpath('//div[@class="toolbar"]/a[contains(@href, ".xml")]/@href').extract()[1]
'/dhq/vol/16/3/000642.xml'
But when I try to yield all the links, I end up with a CSV full of incomplete links, or just the root link repeated many times over.
Example dataset: https://pastebin.com/JqCKnxV5
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class DhqSpider(CrawlSpider):
    name = 'dhq'
    allowed_domains = ['digitalhumanities.org']
    start_urls = ['http://www.digitalhumanities.org/dhq/vol/16/3/index.html']

    rules = (
        Rule(LinkExtractor(allow='index.html')),
        Rule(LinkExtractor(allow='vol'), callback='parse_xml'),
    )

    def parse_xml(self, response):
        xmllinks = response.xpath('//div[@class="toolbar"]/a[contains(@href, ".xml")]/@href').extract()[1]
        for link in xmllinks:
            yield {
                'file_urls': [response.urljoin(link)]
            }
What am I missing in my urljoin that's creating these incomplete and/or root links?

Your CrawlSpider scrapes data from each detail page, and your XPath selects more than one element while you only need one. Worse, extract()[1] returns a single string, so your for loop iterates over that string character by character, and urljoin on single characters is what produces the root and incomplete links. You can apply the built-in indexing of the XPath expression to select exactly one element and avoid the unnecessary for loop.
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class DhqSpider(CrawlSpider):
    name = 'dhq'
    allowed_domains = ['digitalhumanities.org']
    start_urls = ['http://www.digitalhumanities.org/dhq/vol/16/3/index.html']

    rules = (
        Rule(LinkExtractor(allow='index.html')),
        Rule(LinkExtractor(allow='vol'), callback='parse_xml'),
    )

    def parse_xml(self, response):
        xmllink = response.xpath('(//div[@class="toolbar"]/a[contains(@href, ".xml")]/@href)[1]').get()
        yield {
            'file_urls': response.urljoin(xmllink)
        }
Output:
{'file_urls': 'http://www.digitalhumanities.org/dhq/vol/12/1/000355.xml'}
2022-12-14 20:28:58 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://www.digitalhumanities.org/dhq/vol/12/1/000346/000346.html> (referer: http://www.digitalhumanities.org/dhq/vol/12/1/index.html)
2022-12-14 20:28:58 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.digitalhumanities.org/dhq/vol/12/1/000346/000346.html>
{'file_urls': 'http://www.digitalhumanities.org/dhq/vol/12/1/000346.xml'}
2022-12-14 20:29:03 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://www.digitalhumanities.org/dhq/vol/12/1/000362/000362.html> (referer: http://www.digitalhumanities.org/dhq/vol/12/1/index.html)
2022-12-14 20:29:03 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.digitalhumanities.org/dhq/vol/12/1/000362/000362.html>
{'file_urls': 'http://www.digitalhumanities.org/dhq/vol/12/1/000362.xml'}
2022-12-14 20:29:03 [scrapy.core.engine] INFO: Closing spider (finished)
2022-12-14 20:29:03 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 242004,
'downloader/request_count': 754,
'downloader/request_method_count/GET': 754,
'downloader/response_bytes': 69368110,
'downloader/response_count': 754,
'downloader/response_status_count/200': 754,
'dupefilter/filtered': 3221,
'elapsed_time_seconds': 51.448049,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2022, 12, 14, 14, 29, 3, 317586),
'item_scraped_count': 697,
... so on
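For reference, here is why the original spider produced those links: extract()[1] returns a single string, not a list, so the for loop iterates over it character by character. A minimal sketch of the failure mode:

    # what the original parse_xml effectively did
    xmllinks = '/dhq/vol/16/3/000642.xml'   # extract()[1] is ONE string
    for link in xmllinks:                   # iterates single characters
        response.urljoin(link)              # urljoin('/') -> the site root,
                                            # urljoin('d') -> page-relative junk

Also note that if the items are meant to feed Scrapy's FilesPipeline, file_urls should be a list of URLs, as in the original question's code.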
UPDATE:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class DhqSpider(CrawlSpider):
    name = 'dhq'
    allowed_domains = ['digitalhumanities.org']
    start_urls = ['http://www.digitalhumanities.org/dhq/vol/16/3/index.html']

    rules = (
        Rule(LinkExtractor(allow='index.html')),
        Rule(LinkExtractor(allow='vol'), callback='parse_xml'),
    )

    def parse_xml(self, response):
        #xmllink = response.xpath('(//div[@class="toolbar"]/a[contains(@href, ".xml")]/@href)[1]').get()
        #'file_urls': response.urljoin(xmllink)
        yield {
            'title': response.css('h1.articleTitle::text').get().strip().replace('\n', ' ').replace('\t', ''),
            'author': response.css('div.author a::text').get().strip(),
            'pubinfo': response.css('div#pubInfo::text').getall(),
            'xmllink': response.urljoin(response.xpath('(//div[@class="toolbar"]/a[contains(@href, ".xml")]/@href)[1]').get()),
            #'referrer_url': response.url
        }
OUTPUT:
[
{
"title": "Textension: Digitally Augmenting Document Spaces in Analog Texts",
"author": "Adam James Bradley",
"pubinfo": [
"2019",
"Volume 13 Number 3"
],
"xmllink": "http://www.digitalhumanities.org/dhq/vol/13/3/000426.xml"
},
{
"title": "Building the",
"author": "Cait Coker",
"pubinfo": [
"2019",
"Volume 13 Number 3"
],
"xmllink": "http://www.digitalhumanities.org/dhq/vol/13/3/000428.xml"
},
{
"title": "Dendrography and Art History: a computer-assisted analysis of Cézanne’s",
"author": "Melinda Weinstein",
"pubinfo": [
"2019",
"Volume 13 Number 3"
],
"xmllink": "http://www.digitalhumanities.org/dhq/vol/13/3/000423.xml"
},
{
"title": "The Invisible Work of the Digital Humanities Lab: Preparing Graduate Students for Emergent Intellectual and Professional Work",
"author": "Dawn Opel",
"pubinfo": [
"2019",
"Volume 13 Number 2"
],
"xmllink": "http://www.digitalhumanities.org/dhq/vol/13/2/000421.xml"
},
{
"title": "Modelling Medieval Hands: Practical OCR for Caroline Minuscule",
"author": "Brandon W. Hawk",
"pubinfo": [
"2019",
"Volume 13 Number 1"
],
"xmllink": "http://www.digitalhumanities.org/dhq/vol/13/1/000412.xml"
},
{
"title": "Introduction: Questioning",
"author": "Tarez Samra Graban",
"pubinfo": [
"2019",
"Volume 13 Number 2"
],
"xmllink": "http://www.digitalhumanities.org/dhq/vol/13/2/000416.xml"
},
{
"title": "Racism in the Machine: Visualization Ethics in Digital Humanities Projects",
"author": "Katherine Hepworth",
"pubinfo": [
"2018",
"Volume 12 Number 4"
],
"xmllink": "http://www.digitalhumanities.org/dhq/vol/12/4/000408.xml"
},
{
"title": "Narrelations — Visualizing Narrative Levels and their Correlations with Temporal Phenomena",
"author": "Hannah Schwan",
"pubinfo": [
"2019",
"Volume 13 Number 3"
],
"xmllink": "http://www.digitalhumanities.org/dhq/vol/13/3/000414.xml"
},
{
"title": "Towards 3D Scholarly Editions: The Battle of Mount Street Bridge",
"author": "Costas Papadopoulos",
"pubinfo": [
"2019",
"Volume 13 Number 1"
],
"xmllink": "http://www.digitalhumanities.org/dhq/vol/13/1/000415.xml"
},
{
"title": "Visual Communication and the promotion of Health: an exploration of how they intersect in Italian education",
"author": "Viviana De Angelis",
"pubinfo": [
"2018",
"Volume 12 Number 4"
],
"xmllink": "http://www.digitalhumanities.org/dhq/vol/12/4/000407.xml"
},
{
"title": "Best Practices: Teaching Typographic Principles to Digital Humanities Audiences",
"author": "Amy Papaelias",
"pubinfo": [
"2018",
"Volume 12 Number 4"
],
"xmllink": "http://www.digitalhumanities.org/dhq/vol/12/4/000405.xml"
},
{
"title": "Placing Graphic Design at the Intersection of Information Visualization Fields",
"author": "Yvette Shen",
"pubinfo": [
"2018",
"Volume 12 Number 4"
],
"xmllink": "http://www.digitalhumanities.org/dhq/vol/12/4/000406.xml"
},
{
"title": "Making and Breaking: Teaching Information Ethics through Curatorial Practice",
"author": "Christina Boyles",
"pubinfo": [
"2018",
"Volume 12 Number 4"
],
"xmllink": "http://www.digitalhumanities.org/dhq/vol/12/4/000404.xml"
},
{
"title": "Critically engaging with data visualization through an information literacy framework",
"author": "Steven Braun",
"pubinfo": [
"2018",
"Volume 12 Number 4"
],
"xmllink": "http://www.digitalhumanities.org/dhq/vol/12/4/000402.xml"
},
{
"title": "Renaissance Remix.",
"author": "Deanna Shemek",
"pubinfo": [
"2018",
"Volume 12 Number 4"
],
"xmllink": "http://www.digitalhumanities.org/dhq/vol/12/4/000400.xml"
},
{
"title": "Crowdsourcing Image Extraction and Annotation: Software Development and Case Study",
"author": "Ana Jofre",
"pubinfo": [
"2020",
"Volume 14 Number 2"
],
"xmllink": "http://www.digitalhumanities.org/dhq/vol/14/2/000469.xml"
},
{
"title": "Defining scholarly practices, methods and tools in the Lithuanian digital humanities research community",
"author": "Ingrida Kelpšienė",
"pubinfo": [
"2018",
"Volume 12 Number 4"
],
"xmllink": "http://www.digitalhumanities.org/dhq/vol/12/4/000401.xml"
}
]
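One caveat visible in this output: titles like "Building the" and "Introduction: Questioning" are cut short, most likely because h1.articleTitle contains child elements (italicized work titles, for instance) and ::text with .get() returns only the first text node. A sketch of a fuller extraction that joins all descendant text nodes:

    title = ' '.join(
        t.strip()
        for t in response.css('h1.articleTitle ::text').getall()
    ).strip()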

Related

Power Automate HTTP Request POST ends with "Internal Server Error"

I'm not sure why I get an "Internal Server Error" when I send a POST request with Power Automate, but if I send the same request with Postman it works just fine. As you can see, the flow tried 4 times until it stopped. I call the same REST API with Postman and it works without problems. The Authorization, by the way, is Basic with username and password.
This is the response from Postman when I send the POST request:
{
"executionTime": 3360,
"hints": [
{
"id": "STRING",
"name": "STRING",
"type": "GROUP_HINT",
"locations": [
{
"tokens": [
{
"start": 33,
"end": 41
},
{
"start": 61,
"end": 66
}
],
"score": 1.0,
"distance": 0,
"groupValue": {
"left": "STRING",
"right": "STRING"
}
}
],
"groupElement": [
"MATCHED GROUPS"
],
"range": 20
}
],
"businessCases": [
{
"businessCaseId": "ee05f8f9-e18d-4bf7-b7db-4c8bc800615a",
"name": "STRING",
"score": 0.53,
"hintCount": 1596,
"originCount": 241,
"matchedHints": [
"MATCHED GROUPS"
],
"matchedBoostedHints": [
"MATCHED GROUPS"
],
"origins": [],
"matchedOrigins": [
"MATCHED GROUPS"
],
"matchedBoostedOrigins": [
"MATCHED GROUPS"
],
"unmatchedOrigins": []
}
],
"questions": [],
"extractedValues": []
}
The REST API should analyse the text in the POST request and return any matched groups through sentiment analysis.
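When Postman succeeds but the Power Automate HTTP action returns a 500, the usual suspect is a header Postman adds automatically, most often Content-Type. A sketch of the HTTP action settings worth double-checking (the URI and body here are placeholders, not the actual endpoint):

    Method:  POST
    URI:     (the same endpoint used in Postman)
    Headers: Content-Type: application/json
    Authentication: Basic, with the same username and password as in Postman
    Body:    the raw JSON payload copied from Postman's Body tab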

Couchbase query filtering by tag in an array

I have an author bucket. In this bucket I keep the author info and the author's articles. I want to select the articles that have the tags I want from the author bucket.
I have tried this, but I could not figure out how to do the filtering.
SELECT art.* FROM author AS a
UNNEST a.articles AS art
WHERE art.tags = 'History'
This is the author bucket:
{
  "about": {
    "name": "sassa",
    "userName": "sassatur"
  },
  "articles": [
    {
      "authorId": [
        "8c7ba33e-0674-4d99-bfad-29d144028bc9"
      ],
      "claps": [],
      "comments": [],
      "content": {
        "articleType": "HTML",
        "data": "My First Article"
      },
      "id": "71d6fa22-61be-4a93-8e86-8d569080da97",
      "publishStatus": "UNLISTED",
      "statistic": {
        "articleId": "71d6fa22-61be-4a93-8e86-8d569080da97",
        "views": [
          1602683127039,
          1602683148270
        ]
      },
      "tags": [
        "Art, History"
      ],
      "title": "Culture"
    },
    {
      "authorId": [
        "8c7ba33e-0674-4d99-bfad-29d144028bc9"
      ],
      "claps": [],
      "comments": [],
      "content": {
        "articleType": "HTML",
        "data": "My First Article"
      },
      "id": "81d6fa22-63be-4a93-8e86-8d569080da97",
      "publishStatus": "UNLISTED",
      "statistic": {
        "views": [
          1602683127039,
          1602683148270
        ]
      },
      "tags": [
        "Art"
      ],
      "title": "Culture"
    }
  ],
  "id": "8c7ba33e-0674-4d99-bfad-29d144028bc9"
}
Try using ANY/IN/SATISFIES, like so:
SELECT art.* FROM author AS a
UNNEST a.articles AS art
WHERE ANY x IN art.tags SATISFIES x == 'Art' END;
This works for 'Art' in your example, but not for 'History', because of the way you are storing tags: it's an array, but it appears to contain a single item with comma-separated values. So instead of "tags": ["Art, History"], I would recommend "tags": ["Art", "History"], and then it will work.
However, if you are stuck with the comma-separated string, you can use SPLIT and ARRAY_CONTAINS as well:
SELECT art.* FROM author AS a
UNNEST a.articles AS art
WHERE ANY x IN art.tags SATISFIES ARRAY_CONTAINS(SPLIT(x,", "), 'History') END;

DRF: how to join many-to-many field into a string in json

models.py
class SalesPerson(models.Model):
    name = models.CharField(max_length=50)
    office = models.ForeignKey(Location, on_delete=models.SET_NULL, null=True)

class Project(models.Model):
    title = models.CharField(max_length=50)
    leader = models.ForeignKey(SalesPerson, on_delete=models.SET_NULL, null=True, related_name='leader')
    location = models.ForeignKey(Location, on_delete=models.SET_NULL, null=True)
    product = models.ManyToManyField(Product)
    sales_person = models.ManyToManyField(SalesPerson)
serializers.py
class ProjectSerializer(serializers.ModelSerializer):
    leader_name = serializers.ReadOnlyField(source='leader.name')
    location_name = serializers.ReadOnlyField(source='location.name')
    product = serializers.SlugRelatedField(read_only=True, slug_field='name', many=True)
    sales_person = serializers.SlugRelatedField(read_only=True, slug_field='name', many=True)

    class Meta:
        model = Project
        fields = ('id', 'title', 'leader_name', 'location_name', 'product', 'sales_person')

class SPSerializer(serializers.ModelSerializer):
    projects = ProjectSerializer(many=True, read_only=True, source='project_set')
    office_name = serializers.ReadOnlyField(source='office.city')

    class Meta:
        model = SalesPerson
        fields = ('id', 'name', 'office_name', 'email', 'projects')
result:
{
"id": 2,
"name": "Angela",
"office_name": "NSW Sydney",
"email": "angela#angela.com",
"projects": [
{
"id": 1,
"title": "Mall Orchid",
"leader_name": "Boris",
"product": [
"Split wall mounted"
],
"sales_person": [
"Angela",
"Boris",
"David"
],
},
{
"id": 6,
"title": "Mall Petunia",
"leader_name": "Boris",
"product": [
"Split duct"
],
"sales_person": [
"Angela",
"Boris",
"David"
],
},
]
},
I am going to consume the JSON in React Native. I know how to iterate over "projects", but I want to avoid iterating over "sales_person" to make rendering the array simpler. So I am sure I have to turn sales_person into a string, but after googling for many hours today, I can't find an answer. I am hoping to do this in Django Rest Framework and not in Expo React Native, if possible.
In short, I want the result to look like this:
{
"id": 2,
"name": "Angela",
"office_name": "NSW Sydney",
"email": "angela#angela.com",
"projects": [
{
"id": 1,
"title": "Mall Orchid",
"leader_name": "Boris",
"product": [
"Split wall mounted"
],
"sales_person": "Angela", "Boris", "David",
},
{
"id": 6,
"title": "Mall Petunia",
"leader_name": "Boris",
"product": [
"Split duct"
],
"sales_person": "Angela", "Boris", "David",
},
]
},
Thank you so much for your help.
I've found the answer. I am writing it here for my own future reference:
class ProjectSerializer(serializers.ModelSerializer):
    leader_name = serializers.ReadOnlyField(source='leader.name')
    location_name = serializers.ReadOnlyField(source='location.city')
    product = serializers.SlugRelatedField(read_only=True, slug_field='name', many=True)
    sales_person = serializers.SerializerMethodField('get_sales_person')

    class Meta:
        model = Project
        fields = ('id', 'title', 'leader_name', 'location_name', 'product', 'sales_person')

    def get_sales_person(self, obj):
        return ', '.join([sales_person.name for sales_person in obj.sales_person.all()])
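With this serializer, each project renders sales_person as one comma-separated string, so there is nothing left to iterate over on the React Native side:

    "sales_person": "Angela, Boris, David"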

How to store JSON data in a meaningful way in Oracle

Using the Twitter API, I can get tweets like this:
{
  "coordinates": null,
  "created_at": "Mon Sep 24 03:35:21 +0000 2012",
  "id_str": "250075927172759552",
  "entities": {
    "urls": [],
    "hashtags": [
      {
        "text": "freebandnames",
        "indices": [20, 34]
      }
    ],
    "user_mentions": []
  },
  "in_reply_to_user_id_str": null,
  "contributors": null,
  "text": "Aggressive Ponytail #freebandnames",
  "metadata": {
    "iso_language_code": "en",
    "result_type": "recent"
  },
  "retweet_count": 0,
  "user": {
    "profile_background_color": "C0DEED",
    "verified": false,
    "geo_enabled": true,
    "time_zone": "Pacific Time (US & Canada)",
    "description": "Born 330 Live 310",
    "default_profile_image": false,
    "profile_background_image_url": "http://a0.twimg.com/images/themes/theme1/bg.png",
    "statuses_count": 579,
    "friends_count": 110,
    "following": null,
    "show_all_inline_media": false,
    "screen_name": "sean_cummings"
  },
  "in_reply_to_screen_name": null,
  "source": "Twitter for Mac",
  "in_reply_to_status_id": null
}
You can see that this data is a natural fit for MongoDB; you can easily write it there as-is. But I want to store this data in an SQL database like Oracle, and I don't know how to store nested parts like:
"entities": {
"urls": [
],
"hashtags": [
{
"text": "freebandnames",
"indices": [
20,
34
]
}
],
"user_mentions": [
]
Can you tell me how I should store such properties in Oracle? Should I create a new table for each nested property (which I am unwilling to do), or is there another way? Is there a magical way to store all the tweet data in one place, like in NoSQL? Thanks.
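If you are on Oracle 12c or later, there is such a "magical" option: keep the whole tweet in a single JSON column guarded by an IS JSON check constraint, and pull nested pieces out with JSON_VALUE / JSON_TABLE, so no table-per-nested-property is needed. A minimal sketch (table and column names are made up for illustration):

    CREATE TABLE tweets (
      id_str VARCHAR2(30) PRIMARY KEY,
      doc    CLOB CONSTRAINT tweets_doc_is_json CHECK (doc IS JSON)
    );

    -- pull one scalar out of the nested structure
    SELECT JSON_VALUE(doc, '$.metadata.iso_language_code') AS lang
    FROM tweets;

    -- unnest the hashtags array into rows
    SELECT t.id_str, h.tag
    FROM tweets t,
         JSON_TABLE(t.doc, '$.entities.hashtags[*]'
                    COLUMNS (tag VARCHAR2(100) PATH '$.text')) h;

On releases before 12c, the fallback is relational tables per nested collection, or storing the raw JSON in a CLOB and parsing it in the application.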

IPython notebook export external svg to pdf

In a markdown cell in an IPython 3 notebook (4.0.0) I include an SVG that sits in the same directory as the notebook file:
<img src="NewTux.svg"/>
In the normal notebook view it is displayed as expected.
However, when I try to export to pdf the image does not show up.
What puzzles me is that a matplotlib plot (with %config InlineBackend.figure_format = 'svg') shows up perfectly both on screen AND in the exported PDF.
How can I get a PDF including also the svgs which are not plotted but just included as a figure in markdown?
(A workaround is to print to PDF from the browser, but then I lose the LaTeX formatting of formulas and the color in the syntax highlighting of the code sections.)
A minimal working example for the notebook file is:
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"%config InlineBackend.figure_format = 'svg'\n",
"import numpy as np\n",
"import matplotlib.pyplot as pp\n",
"\n",
"x = np.arange(0,10,0.05)\n",
"y = np.sin(x)\n",
"\n",
"pp.plot(x,y)\n",
"pp.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<img src=\"NewTux.svg\">"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
I downloaded NewTux.svg from Wikimedia Commons.
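A likely explanation: nbconvert's PDF export goes through LaTeX, which cannot embed SVG files directly, so the markdown-referenced SVG is dropped, while the matplotlib figure is converted to a LaTeX-friendly format by nbconvert itself. One workaround is to convert the SVG to PDF (or PNG) first and reference the converted file in the markdown cell. A minimal sketch using the cairosvg package (assuming it is installed; inkscape or rsvg-convert can do the same conversion):

    import cairosvg  # pip install cairosvg

    # render the standalone SVG into a PDF that LaTeX can embed
    cairosvg.svg2pdf(url="NewTux.svg", write_to="NewTux.pdf")

Then reference NewTux.pdf in the markdown cell (for example <img src="NewTux.pdf"/>) before exporting; since the live notebook view renders the SVG better, some people keep both files and switch the reference just for the export.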