Source code for kgeserver.dbpedia_dataset

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# coding:utf-8
#
# ESDBpediaDataset Class: Creates a data set from the Spanish DBpedia
# Copyright (C) 2016  Víctor Fernández Rico <vfrico@gmail.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import kgeserver
import kgeserver.dataset
import re
import math


class ESDBpediaDataset(kgeserver.dataset.Dataset):
    """Dataset whose triples are fetched from the Spanish DBpedia.

    All queries are sent to the ``es.dbpedia.org`` SPARQL endpoint. The
    entities already visited by :meth:`_process_entity` are remembered in
    ``entities_explored`` so they are not queried twice.
    """

    def __init__(self, thread_limiter=2):
        """Create a dataset bound to the Spanish DBpedia SPARQL endpoint.

        The endpoint is fixed to ``http://es.dbpedia.org/sparql?query=``.

        :param integer thread_limiter: The number of concurrent HTTP queries
        """
        sparql_endpoint = "http://es.dbpedia.org/sparql?query="
        super(ESDBpediaDataset, self).__init__(
            sparql_endpoint=sparql_endpoint,
            thread_limiter=thread_limiter)

        # Save all entities already explored by process_entity (saves time)
        self.entities_explored = {}

    def check_entity(self, entity):
        """Return *entity* if it is a Spanish DBpedia resource, else None.

        Example of an accepted URI:
        ``http://es.dbpedia.org/resource/Siemens_Velaro``

        :param string entity: The URI to check
        :return: The whole URI, unchanged, or None if it is not accepted
        """
        uri_parts = entity.split("/")
        try:
            # "http://host/a/b" splits into ['http:', '', 'host', 'a', 'b'],
            # so index 2 is the host and index -2 the path segment that
            # precedes the resource name.
            accepted = (uri_parts[-2] == 'resource' and
                        uri_parts[2] == 'es.dbpedia.org')
        except IndexError:
            # Too few "/" separated parts to be a resource URI
            return None
        return entity if accepted else None

    def check_relation(self, relation):
        """Return *relation* if it is an accepted predicate URI, else None.

        A relation is accepted when its last-but-one path segment is
        ``property`` or ``ontology`` (``wikiPageWikiLink`` excluded), or when
        its host is ``xmlns.com`` or ``www.w3.org``. Examples::

            http://es.dbpedia.org/property/vmax
            http://es.dbpedia.org/ontology/wikiPageRedirects
            http://xmlns.com/foaf/0.1/isPrimaryTopicOf
            http://www.w3.org/2002/07/owl#sameAs
            http://www.w3.org/2000/01/rdf-schema#label

        :param string relation: The URI to check
        :return: The whole URI, unchanged, or None if it is not accepted
        """
        uri_parts = relation.split("/")
        try:
            # The host test (index 2) is evaluated last, as a fallback for
            # URIs that are not DBpedia properties.
            accepted = (
                (uri_parts[-2] in ('property', 'ontology') and
                 uri_parts[-1] != "wikiPageWikiLink") or
                uri_parts[2] in ('xmlns.com', 'www.w3.org'))
        except IndexError:
            # Too few "/" separated parts to be a predicate URI
            return None
        return relation if accepted else None

    def load_from_graph_pattern(self, verbose=0, where="", **kwargs):
        """Load the triples matching a SPARQL graph pattern, in batches.

        The matches of *where* are counted first; then the pattern is
        queried in pages of ``batch_size`` rows (``LIMIT``/``OFFSET``) and
        every page is handed to ``load_dataset_from_query``. The pattern is
        expected to bind ``?subject``, ``?object`` and ``?predicate``,
        which are the variables selected by the paged query.

        :param verbose: The desired level of verbosity
        :param string where: SPARQL where to construct query
        :param int batch_size: The size of batches queried each time
                               (defaults to 100000)
        :param start_callback: Optional callable, invoked once with the
                               number of queries that are going to be made
        :param callback: Optional callable, invoked with no arguments after
                         each batch has been loaded
        :return: A list of entities
        :rtype: list
        """
        # Count all the elements matched by the pattern
        count_query = """
            PREFIX dcterms: <http://purl.org/dc/terms/>
            SELECT (count(*) as ?count)
            WHERE {{
                {0}
            }}""".format(where)
        if verbose > 2:
            print("The count query is: \n", count_query)
        sts, count_json = self.execute_query(count_query)
        if verbose > 2:
            print(sts, count_json)

        # The number of elements
        entities_number = int(count_json[0]['count']['value'])
        if verbose > 0:
            print("Found {} entities".format(entities_number))

        limit = kwargs.get('batch_size', 100000)
        rounds_number = math.ceil(entities_number / limit)
        print("Hay {} tripletas, se van a hacer {} consultas".format(
              entities_number, rounds_number))

        if 'start_callback' in kwargs:
            kwargs['start_callback'](rounds_number)

        for q in range(0, rounds_number):
            offset = q * limit
            first_query = """
                PREFIX dcterms: <http://purl.org/dc/terms/>
                SELECT ?subject ?object ?predicate
                WHERE {{
                    {2}
                }}
                LIMIT {0}
                OFFSET {1}
                """.format(limit, offset, where)
            if verbose > 2:
                print("The query is: \n", first_query)

            self.load_dataset_from_query(first_query)
            self.show()

            if 'callback' in kwargs:
                kwargs['callback']()

        return self.entities

    def _process_entity(self, entity, verbose=0,
                        graph_pattern=("{0} ?predicate ?object . ")):
        """Take entity and explore all relations and entities related to it

        This will execute the SPARQL query with the params passed to build
        a dataset with the *object* elements on triples retrieved from the
        server.

        :param string entity: The URI of the entity to explore
        :param verbose: The desired level of verbosity
        :param string graph_pattern: Pattern for the WHERE clause, where
                                     ``{0}`` is replaced by ``<entity>``
        :return: A list with new entities to be scanned, or False if the
                 entity was already explored, the HTTP status is not 200 or
                 a result row lacks the expected keys
        """
        checked_entity = self.check_entity(entity)

        # Check first if entity has been already explored
        if self.exist_element(checked_entity, self.entities_explored):
            return False

        wdt_entity = "<{0}>".format(entity)
        el_query = """SELECT ({1} as ?subject) ?predicate ?object
            WHERE {{
                {0}
            }}""".format(graph_pattern.format(wdt_entity), wdt_entity)
        if verbose > 2:
            print("The element query is: \n", el_query)

        # Get all related elements
        sts, el_json = self.execute_query(el_query)
        if verbose > 2:
            print("HTTP", sts, len(el_json))

        # Compare by value: "is not" on an int only works by accident
        if sts != 200:
            return False

        # Mark entity as already explored
        self.entities_explored[checked_entity] = True

        # Entities to be explored next level
        to_queue = []

        # For related elements, get all relations and objects
        for relation in el_json:
            try:
                object_uri = relation['object']['value']

                # Queue the object only if it is a valid entity
                if self.check_entity(object_uri):
                    to_queue.append(object_uri)

                # Add triple will ensure every elements are valid
                self.add_triple(entity, object_uri,
                                relation['predicate']['value'])
            except KeyError:
                print("Error on relation: {}".format(relation))
                return False

        return to_queue