DBPedia Person dataset - sparql

I need to extract persons' names, including their variations, from DBpedia.
My SPARQL request:
# Collect every English-tagged name recorded for a foaf:Person, taken from
# either foaf:name or rdfs:label.
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
select distinct ?o where {
# Branch 1: names stated with foaf:name.
{ ?instance <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://xmlns.com/foaf/0.1/Person>;
<http://xmlns.com/foaf/0.1/name> ?o }
union
# Branch 2: names stated with rdfs:label. The object ?o is required here;
# without it the triple pattern is incomplete and the query does not parse.
{
?instance <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://xmlns.com/foaf/0.1/Person>;
rdfs:label ?o
}
# Keep only values whose language tag matches "en".
FILTER (langMatches(lang(?o),"en"))}
DBpedia returns only 50,000 names for a single SPARQL request.
Maybe a dataset exists for persons with all of their name variations?
The existing persons_en.nt dataset contains only foaf:name, but I need the other name variations. Sometimes they are listed in rdfs:label (e.g. for Maria Sharapova).

Found the answer in another post. Run the following SPARQL:
# Fetch one page of the distinct English rdfs:label values of foaf:Person
# instances. The DISTINCT + ORDER BY query is nested as a subquery, and the
# outer query applies OFFSET/LIMIT to the sorted list it produces.
SELECT ?o
WHERE {
  {
    SELECT DISTINCT ?o
    WHERE {
      ?instance a <http://xmlns.com/foaf/0.1/Person> .
      ?instance rdfs:label ?o .
      FILTER ( langMatches( lang(?o), "en" ) )
    }
    # Ascending is the default sort direction.
    ORDER BY ?o
  }
}
OFFSET 800000
LIMIT 50000
in the Java program:
Query query = QueryFactory.create(queryString);
QueryExecution qexec = QueryExecutionFactory.sparqlService("http://dbpedia.org/sparql", query);

Related

Find orphan nodes with SPARQL

I am trying to find orphan nodes (nodes which do not have any incoming relations) with SPARQL in a Fuseki database.
I tried several queries which all do not return correct results.
I tried the following:
Query 1 (got this from linkedIn)
select ?o ?isOrphan where { GRAPH <http://localhost:8080/catalog/-1305288727> {
?s ?p ?o .
FILTER(!isLiteral(?o))
bind(!(EXISTS {?o ?p1 ?o2}) as ?isOrphan)}}
Query 2
SELECT ?source ?s ?p ?o
WHERE { GRAPH <http://localhost:8080/catalog/-1305288727>{
?s ?p ?o .
FILTER EXISTS {?source ?p ?s } .
}
}
Query 3 - unbound variable pp in FILTER
SELECT ?source ?s ?p ?o
WHERE { GRAPH <http://localhost:8080/catalog/-1305288727>{
?s ?p ?o .
FILTER EXISTS {?source ?pp ?s } .
}
}
Any help is highly appreciated.
This query finds each entity that is the subject of any triple, and then checks that this entity is not the object of any triple.
# List every node that is the subject of at least one triple but is never the
# object of any triple in the graph loaded via FROM, i.e. a node with no
# incoming relations.
# DISTINCT is needed because the first pattern matches once per outgoing
# triple, which would otherwise repeat a node for every triple it has.
SELECT DISTINCT ?orphan
FROM <http://localhost:8080/catalog/-1305288727>
WHERE {
?orphan ?p1 [] .
# Discard the candidate if any triple has it as its object.
FILTER NOT EXISTS { ?linkingNode ?p2 ?orphan . }
}

SPARQL returning empty result when passing MIN(?date1) from subquery into outer query, with BIND((YEAR(?minDate) - YEAR(?date2)) AS ?diffDate)

<This question is now resolved, see comment by Valerio Cocchi>
I am trying to pass a variable from a subquery, that takes the minimum date of a set of dates ?date1 belonging to ?p and passes this to the outer query, which then takes another date ?date2 belonging to ?p (there can be at most 1 ?date2 for every ?p) and subtracts ?minDate from ?date2 to get an integer value for the number of years between. I am getting a blank value for this, i.e. ?diffDate returns no value.
I am using Fuseki version 4.3.2. Here is an example of the query:
SELECT ?p ?minDate ?date2 ?diffDate
{
?p a abc:P;
abc:hasAnotherDate ?date2.
BIND((YEAR(?minDate) - YEAR(?date2)) AS ?diffDate)
{
SELECT ?p (MIN(?date1) as ?minDate)
WHERE
{
?p a abc:P;
abc:hasDate ?date1.
} group by ?p
}
}
and an example of the kind of result I am getting:
|-?p----|-----------------?minDate-------------|-----------------?date2------------- |?diffDate|
|<123>|20012-11-22T00:00:00"^^xsd:dateTime|2008-08-18T00:00:00"^^xsd:dateTime| |
I would expect that ?diffDate would give me an integer value. Am I missing something fundamental about how subqueries work in SPARQL?
It seems you have encountered quite an obscure part of the SPARQL spec, namely how BIND works.
Normally SPARQL is evaluated without regard for the position of atoms, i.e.
SELECT *
WHERE {
?a :p1 ?b .
?b :p2 ?c .}
is the same query as:
SELECT *
WHERE {
?b :p2 ?c .
?a :p1 ?b .}
However, BIND is position dependent, so e.g.:
SELECT *
WHERE {
?a :p1 ?b .
BIND(:john AS ?a)}
is not a valid query, whereas:
SELECT *
WHERE {
BIND(:john AS ?a)
?a :p1 ?b .
}
is entirely valid. The same applies to variables used inside of the BIND, which must be declared before the BIND appears.
See here for more.
To go back to your problem, your BIND is using the ?minDate variable before it has been bound, which is why it fails to produce a value for ?diffDate.
This query should do the trick:
SELECT ?p ?minDate ?date2 ?diffDate
{
?p a abc:P;
abc:hasAnotherDate ?date2.
{
SELECT ?p (MIN(?date1) as ?minDate)
WHERE
{
?p a abc:P;
abc:hasDate ?date1.
} group by ?p
}
BIND((YEAR(?minDate) - YEAR(?date2)) AS ?diffDate) #Put the BIND after all the variables it uses are bound.
}
Alternatively, you could evaluate the difference in the SELECT, like so:
SELECT ?p ?minDate ?date2 (YEAR(?minDate) - YEAR(?date2) AS ?diffDate)
{
?p a abc:P;
abc:hasAnotherDate ?date2.
{
SELECT ?p (MIN(?date1) as ?minDate)
WHERE
{
?p a abc:P;
abc:hasDate ?date1.
} group by ?p
}
}

EXISTS and join with distinct values in two SPARQL queries return different results while they should do the same thing?

When running the following two queries on DBpedia the result is different.
First query gives 68 while the second gives 42. The only difference is the line
filter(exists {[] <http://dbpedia.org/ontology/nationality> ?o.})
replaced by join to ensure that the object of dbpo:country is in dbpo:nationality
{select distinct ?o { [] <http://dbpedia.org/ontology/nationality> ?o.}}
First Query:
select count(*){
{select distinct ?s ?o
{ ?o1 <http://dbpedia.org/ontology/successor> ?s .
?o1 <http://dbpedia.org/ontology/governor> ?o2 .
?o2 <http://dbpedia.org/ontology/country> ?o
filter(exists {[] <http://dbpedia.org/ontology/nationality> ?o.})
filter(exists {?s <http://dbpedia.org/ontology/nationality> []})
}}.
}
Second Query:
select count(*){
{select distinct ?s ?o
{ ?o1 <http://dbpedia.org/ontology/successor> ?s .
?o1 <http://dbpedia.org/ontology/governor> ?o2 .
?o2 <http://dbpedia.org/ontology/country> ?o
{select distinct ?o { [] <http://dbpedia.org/ontology/nationality> ?o.}}
filter(exists {?s <http://dbpedia.org/ontology/nationality> []})
}}.
}
The result of the first query seems to be the correct one.
You've got a DISTINCT in the subquery within the second full query, which is causing some results not to be carried through to the final result set.
Note the result of this query, which drops that keyword from the subquery, matches your first, i.e., 68 --
select count(*)
{ { select distinct ?s ?o
{ ?o1 <http://dbpedia.org/ontology/successor> ?s .
?o1 <http://dbpedia.org/ontology/governor> ?o2 .
?o2 <http://dbpedia.org/ontology/country> ?o
{ select ?o { [] <http://dbpedia.org/ontology/nationality> ?o. } }
filter ( exists { ?s <http://dbpedia.org/ontology/nationality> [] } )
} } }
I can't spare the time to investigate which result rows from the first and third queries are not found in the second, but I imagine that if you dig further into the descriptions of all these ?s and ?o, you will be able to find the answer.
A key hint — SPARQL queries are evaluated from inside-out (also described as from bottom-up, but this is confusing because it's not the literal bottom, but the lowest sub-query). That means that select ?o { [] <http://dbpedia.org/ontology/nationality> ?o. } (or select distinct ?o { [] <http://dbpedia.org/ontology/nationality> ?o. }) is evaluated before the rest of the query -- while the filter clauses are evaluated after the main select.

Number of triples of specific group instances?

I have run into another problem while querying DBpedia with SPARQL. I am trying to get the number of triples for a specific group of class instances.
Number of triples of class Politician:
# All triples that have dbo:Politician as subject or as object.
# SPARQL spells logical OR as "||"; the keyword OR is not part of the grammar.
SELECT * WHERE {?s ?p ?o FILTER (?s = dbo:Politician || ?o = dbo:Politician)}
But what about the total number of all triples for a specific group of politicians, for example the number of triples about German politicians? How can I get that?
Thank you for your help!
revised answer
This will get the count of entities who are described as being Politicians from Germany —
# Count the entities typed dbo:Politician whose dbo:nationality is dbr:Germany.
# The aggregate is projected with an explicit alias, as the SPARQL 1.1
# grammar requires for expressions in the SELECT clause.
SELECT ( COUNT ( * ) AS ?NumberOfPoliticians )
WHERE
{ ?s a dbo:Politician .
?s dbo:nationality dbr:Germany .
}
— and this will get the count of all records where those entities who are described as being Politicians from Germany appear as Subject —
# Count the triples whose subject is typed dbo:Politician and has
# dbo:nationality dbr:Germany.
# The aggregate is projected with an explicit alias, as the SPARQL 1.1
# grammar requires for expressions in the SELECT clause.
SELECT ( COUNT ( * ) AS ?NumberOfTriples )
WHERE
{ ?s a dbo:Politician .
?s dbo:nationality dbr:Germany .
# One solution per triple that has ?s as its subject.
?s ?p ?o .
}
It is possible that you're looking for a bit more info, to include all records where the entities who are described as being Politicians from Germany appears as either Subject or Object (not just as Subject) —
# Count the triples in which a German politician appears as subject or as
# object. A triple that links two German politicians matches both UNION
# branches, so the solutions are de-duplicated on (?s ?p ?o) before counting.
# The aggregate is projected with an explicit alias, as the SPARQL 1.1
# grammar requires for expressions in the SELECT clause.
SELECT ( COUNT ( * ) AS ?NumberOfTriples )
WHERE
{ { SELECT DISTINCT ?s ?p ?o
    WHERE
    { # Branch 1: the German politician is the subject.
      { ?s a dbo:Politician .
        ?s dbo:nationality dbr:Germany .
        ?s ?p ?o .
      }
      UNION
      # Branch 2: the German politician is the object.
      { ?o a dbo:Politician .
        ?o dbo:nationality dbr:Germany .
        ?s ?p ?o .
      }
    }
  }
}
original answer
I think you are currently aiming for this, which counts all triples with dbo:Politician as either Subject or Object (which is currently 41105, without timeout), but note that this query doesn't count "entities which are politicians" which is (I think) what you're really after!
SELECT ( COUNT ( * ) AS ?NumberOfTriples )
WHERE
{ { dbo:Politician ?p ?o }
UNION
{ ?s ?p dbo:Politician }
}
If you want to count the number of "entities which are politicians" (i.e., rdf:type dbo:Politician) (currently 41078), you need a different query, like this --
SELECT ( COUNT ( DISTINCT ?s ) AS ?NumberOfPoliticians )
WHERE
{ ?s rdf:type dbo:Politician }
This should be clarified by a look at the { dbo:Politician ?p ?o } triples --
SELECT *
WHERE
{ dbo:Politician ?p ?o }

How to pass the output of one SPARQL query as input to another SPARQL query

I am trying to get the DBpedia movie link using the movie name in the first query, and then pass that link to the second query to get movies similar to this movie, e.g. Lagaan. Instead of passing the link manually in the second query, is there a way to combine the two queries and pass the output of the first query (i.e. the link of the movie Lagaan) as input to the second query? Also, the first query may return multiple links (e.g. searching for Harry Potter returns the links of several films in the Harry Potter series), so that case should be handled as well.
Query1
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
prefix dbpedia-owl: <http://dbpedia.org/ontology/>
select distinct ?film where {
?film a dbpedia-owl:Film .
?film rdfs:label ?label .
filter regex( str(?label), "Lagaan", "i")
}
limit 10
Query 2
PREFIX dbpedia-owl: <http://dbpedia.org/ontology/>
select ?similar (count(?p) as ?similarity) where {
values ?movie { <http://dbpedia.org/resource/Lagaan> }
?similar ?p ?o ; a dbpedia-owl:Film .
?movie ?p ?o .
}
group by ?similar ?movie
having count(?p) > 35
order by desc(?similarity)
Edited query:
select ?film ?similar (count(?p) as ?similarity) where {
{
select distinct ?film where {
?film a dbpedia-owl:Film .
?film rdfs:label ?label .
filter regex( str(?label), "Lagaan", "i")
}
}
?similar ?p ?o ; a dbpedia-owl:Film .
?film ?p ?o .
}
group by ?similar ?film
having count(?p) > 35
order by desc(?similarity)
corrected query as told by Joshua Taylor
# Rank films by how many (property, value) pairs they share with one film
# whose label contains "lagaan".
# The prefixes are declared explicitly so the query does not depend on
# endpoint-predefined namespaces.
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
prefix dbpedia-owl: <http://dbpedia.org/ontology/>
select ?film ?other (count(*) as ?similarity) {
{
# Subquery: pick a single film whose lower-cased label contains "lagaan".
select ?film where {
?film a dbpedia-owl:Film ; rdfs:label ?label .
filter contains(lcase(?label),"lagaan")
}
limit 1
}
# Join on every property/value pair the chosen film shares with another film.
?film ?p ?o .
?other a dbpedia-owl:Film ; ?p ?o .
}
group by ?film ?other
# The HAVING condition must be a bracketed expression in SPARQL 1.1.
having (count(?p) > 25)
order by desc(?similarity)
is there a way to combine the two queries and pass the output of first
query as an input to the second query.
SPARQL 1.1 defines subqueries. The results of inner queries are available to outer queries, so they are "passed" to them. In your case, you would have something along the lines of:
select ?similarMovie (... as ?similarity) where {
{ #-- QUERY 1, find one or more films
select distinct ?film where {
#-- ...
}
}
#-- QUERY 2, find films similar to ?film
#-- ...
}