# -*- coding: utf-8 -*-
"""
 Wikidata

 @website     https://wikidata.org
 @provide-api yes (https://wikidata.org/w/api.php)

 @using-api   partially (most things require scraping)
 @results     JSON, HTML
 @stable      no (html can change)
 @parse       url, infobox
"""

from searx import logger
from searx.poolrequests import get
from searx.engines.xpath import extract_text
from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url
from searx.url_utils import urlencode
from searx.utils import match_language, eval_xpath

from json import loads
from lxml.html import fromstring
from lxml import etree

logger = logger.getChild('wikidata')
result_count = 1

# urls
wikidata_host = 'https://www.wikidata.org'
url_search = wikidata_host \
    + '/w/index.php?{query}&ns0=1'

wikidata_api = wikidata_host + '/w/api.php'
url_detail = wikidata_api\
    + '?action=parse&format=json&{query}'\
    + '&redirects=1&prop=text%7Cdisplaytitle%7Cparsewarnings'\
    + '&disableeditsection=1&preview=1&sectionpreview=1&disabletoc=1&utf8=1&formatversion=2'
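# e.g. url_detail.format(query=urlencode({'page': 'Q42', 'uselang': 'en'})) yields
# https://www.wikidata.org/w/api.php?action=parse&format=json&page=Q42&uselang=en&redirects=1&...
# (sketch; Q42 is an illustrative entity id)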

url_map = 'https://www.openstreetmap.org/'\
    + '?lat={latitude}&lon={longitude}&zoom={zoom}&layers=M'
url_image = 'https://commons.wikimedia.org/wiki/Special:FilePath/{filename}?width=500&height=400'

# xpaths
div_ids_xpath = '//div[@id]'
wikidata_ids_xpath = '//ul[@class="mw-search-results"]/li//a/@href'
title_xpath = '//*[contains(@class,"wikibase-title-label")]'
description_xpath = '//div[contains(@class,"wikibase-entitytermsview-heading-description")]'
label_xpath = './/div[contains(@class,"wikibase-statementgroupview-property-label")]/a'
url_xpath = './/a[contains(@class,"external free") or contains(@class, "wb-external-id")]'
wikilink_xpath = './/ul[contains(@class,"wikibase-sitelinklistview-listview")]'\
    + '/li[contains(@data-wb-siteid,"{wikiid}")]//a/@href'
property_row_xpath = './/div[contains(@class,"wikibase-statementview")]'
preferred_rank_xpath = './/span[contains(@class,"wikibase-rankselector-preferred")]'
value_xpath = './/div[contains(@class,"wikibase-statementview-mainsnak")]'\
    + '/*/div[contains(@class,"wikibase-snakview-value")]'
language_fallback_xpath = '//sup[contains(@class,"wb-language-fallback-indicator")]'
calendar_name_xpath = './/sup[contains(@class,"wb-calendar-name")]'
media_xpath = value_xpath + '//div[contains(@class,"commons-media-caption")]//a'
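
# these xpaths scrape Wikidata's rendered entity HTML; each statement group is
# assumed to sit in a <div> whose id is the property id (e.g. id="P856"),
# which is what get_id_cache() below relies on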


def get_id_cache(result):
    id_cache = {}
    for e in eval_xpath(result, div_ids_xpath):
        id = e.get('id')
        if id.startswith('P'):
            id_cache[id] = e
    return id_cache
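
# usage sketch: id_cache = get_id_cache(dom); id_cache.get('P856') is then the
# statement-group <div> for "official website", or None if the entity lacks it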


def request(query, params):
    params['url'] = url_search.format(
        query=urlencode({'search': query}))
    return params
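
# e.g. request('tux', {}) sets params['url'] to
# https://www.wikidata.org/w/index.php?search=tux&ns0=1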


def response(resp):
    results = []
    htmlparser = etree.HTMLParser()
    html = fromstring(resp.content.decode("utf-8"), parser=htmlparser)
    search_results = eval_xpath(html, wikidata_ids_xpath)

    if resp.search_params['language'].split('-')[0] == 'all':
        language = 'en'
    else:
        language = match_language(resp.search_params['language'], supported_languages, language_aliases).split('-')[0]
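    # e.g. a UI language of 'pt-BR' is matched against Wikipedia's supported
    # languages, then reduced to its base code 'pt' for uselang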

    # TODO: make requests asynchronous to avoid timeout when result_count > 1
    for search_result in search_results[:result_count]:
        wikidata_id = search_result.split('/')[-1]
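        # e.g. a search-result href of '/wiki/Q42' yields the entity id 'Q42'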
        url = url_detail.format(query=urlencode({'page': wikidata_id, 'uselang': language}))
        htmlresponse = get(url)
        jsonresponse = loads(htmlresponse.content.decode("utf-8"))
        results += getDetail(jsonresponse, wikidata_id, language, resp.search_params['language'], htmlparser)

    return results


def getDetail(jsonresponse, wikidata_id, language, locale, htmlparser):
    results = []
    urls = []
    attributes = []

    title = jsonresponse.get('parse', {}).get('displaytitle', {})
    result = jsonresponse.get('parse', {}).get('text', {})
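    # sketch of the expected parse-API shape (formatversion=2):
    #   {"parse": {"displaytitle": "<html>...", "text": "<html>...", ...}}
    # missing keys fall through to the guard below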

    if not title or not result:
        return results

    title = fromstring(title, parser=htmlparser)
    for elem in eval_xpath(title, language_fallback_xpath):
        elem.getparent().remove(elem)
    title = extract_text(eval_xpath(title, title_xpath))

    result = fromstring(result, parser=htmlparser)
    for elem in eval_xpath(result, language_fallback_xpath):
        elem.getparent().remove(elem)

    description = extract_text(eval_xpath(result, description_xpath))

    id_cache = get_id_cache(result)

    # URLS

    # official website
    add_url(urls, result, id_cache, 'P856', results=results)

    # wikipedia
    wikipedia_link_count = 0
    wikipedia_link = get_wikilink(result, language + 'wiki')
    if wikipedia_link:
        wikipedia_link_count += 1
        urls.append({'title': 'Wikipedia (' + language + ')',
                     'url': wikipedia_link})

    if language != 'en':
        wikipedia_en_link = get_wikilink(result, 'enwiki')
        if wikipedia_en_link:
            wikipedia_link_count += 1
            urls.append({'title': 'Wikipedia (en)',
                         'url': wikipedia_en_link})

    # TODO: get_wiki_firstlanguage
    # if wikipedia_link_count == 0:

    # more wikis
    add_url(urls, result, id_cache, default_label='Wikivoyage (' + language + ')', link_type=language + 'wikivoyage')
    add_url(urls, result, id_cache, default_label='Wikiquote (' + language + ')', link_type=language + 'wikiquote')
    add_url(urls, result, id_cache, default_label='Wikimedia Commons', link_type='commonswiki')

    add_url(urls, result, id_cache, 'P625', 'OpenStreetMap', link_type='geo')

    # musicbrainz
    add_url(urls, result, id_cache, 'P434', 'MusicBrainz', 'http://musicbrainz.org/artist/')
    add_url(urls, result, id_cache, 'P435', 'MusicBrainz', 'http://musicbrainz.org/work/')
    add_url(urls, result, id_cache, 'P436', 'MusicBrainz', 'http://musicbrainz.org/release-group/')
    add_url(urls, result, id_cache, 'P966', 'MusicBrainz', 'http://musicbrainz.org/label/')

    # IMDb
    add_url(urls, result, id_cache, 'P345', 'IMDb', 'https://www.imdb.com/', link_type='imdb')
    # source code repository
    add_url(urls, result, id_cache, 'P1324')
    # blog
    add_url(urls, result, id_cache, 'P1581')
    # social media links
    add_url(urls, result, id_cache, 'P2397', 'YouTube', 'https://www.youtube.com/channel/')
    add_url(urls, result, id_cache, 'P1651', 'YouTube', 'https://www.youtube.com/watch?v=')
    add_url(urls, result, id_cache, 'P2002', 'Twitter', 'https://twitter.com/')
    add_url(urls, result, id_cache, 'P2013', 'Facebook', 'https://facebook.com/')
    add_url(urls, result, id_cache, 'P2003', 'Instagram', 'https://instagram.com/')

    urls.append({'title': 'Wikidata',
                 'url': 'https://www.wikidata.org/wiki/'
                 + wikidata_id + '?uselang=' + language})

    # INFOBOX ATTRIBUTES (ROWS)

    # DATES
    # inception date
    add_attribute(attributes, id_cache, 'P571', date=True)
    # dissolution date
    add_attribute(attributes, id_cache, 'P576', date=True)
    # start date
    add_attribute(attributes, id_cache, 'P580', date=True)
    # end date
    add_attribute(attributes, id_cache, 'P582', date=True)
    # date of birth
    add_attribute(attributes, id_cache, 'P569', date=True)
    # date of death
    add_attribute(attributes, id_cache, 'P570', date=True)
    # date of spacecraft launch
    add_attribute(attributes, id_cache, 'P619', date=True)
    # date of spacecraft landing
    add_attribute(attributes, id_cache, 'P620', date=True)

    # nationality
    add_attribute(attributes, id_cache, 'P27')
    # country of origin
    add_attribute(attributes, id_cache, 'P495')
    # country
    add_attribute(attributes, id_cache, 'P17')
    # headquarters location
    add_attribute(attributes, id_cache, 'P159')

    # PLACES
    # capital
    add_attribute(attributes, id_cache, 'P36', trim=True)
    # head of state
    add_attribute(attributes, id_cache, 'P35', trim=True)
    # head of government
    add_attribute(attributes, id_cache, 'P6', trim=True)
    # type of government
    add_attribute(attributes, id_cache, 'P122')
    # official language
    add_attribute(attributes, id_cache, 'P37')
    # population
    add_attribute(attributes, id_cache, 'P1082', trim=True)
    # area
    add_attribute(attributes, id_cache, 'P2046')
    # currency
    add_attribute(attributes, id_cache, 'P38', trim=True)
    # height (building)
    add_attribute(attributes, id_cache, 'P2048')

    # MEDIA
    # platform (videogames)
    add_attribute(attributes, id_cache, 'P400')
    # author
    add_attribute(attributes, id_cache, 'P50')
    # creator
    add_attribute(attributes, id_cache, 'P170')
    # director
    add_attribute(attributes, id_cache, 'P57')
    # performer
    add_attribute(attributes, id_cache, 'P175')
    # developer
    add_attribute(attributes, id_cache, 'P178')
    # producer
    add_attribute(attributes, id_cache, 'P162')
    # manufacturer
    add_attribute(attributes, id_cache, 'P176')
    # screenwriter
    add_attribute(attributes, id_cache, 'P58')
    # production company
    add_attribute(attributes, id_cache, 'P272')
    # record label
    add_attribute(attributes, id_cache, 'P264')
    # publisher
    add_attribute(attributes, id_cache, 'P123')
    # original network
    add_attribute(attributes, id_cache, 'P449')
    # distributor
    add_attribute(attributes, id_cache, 'P750')
    # composer
    add_attribute(attributes, id_cache, 'P86')
    # publication date
    add_attribute(attributes, id_cache, 'P577', date=True)
    # genre
    add_attribute(attributes, id_cache, 'P136')
    # original language
    add_attribute(attributes, id_cache, 'P364')
    # isbn (ISBN-13)
    add_attribute(attributes, id_cache, 'P212')
    # software license
    add_attribute(attributes, id_cache, 'P275')
    # programming language
    add_attribute(attributes, id_cache, 'P277')
    # version
    add_attribute(attributes, id_cache, 'P348', trim=True)
    # narrative location
    add_attribute(attributes, id_cache, 'P840')

    # LANGUAGES
    # number of speakers
    add_attribute(attributes, id_cache, 'P1098')
    # writing system
    add_attribute(attributes, id_cache, 'P282')
    # regulatory body
    add_attribute(attributes, id_cache, 'P1018')
    # language code
    add_attribute(attributes, id_cache, 'P218')

    # OTHER
    # ceo
    add_attribute(attributes, id_cache, 'P169', trim=True)
    # founder
    add_attribute(attributes, id_cache, 'P112')
    # legal form (company/organization)
    add_attribute(attributes, id_cache, 'P1454')
    # operator
    add_attribute(attributes, id_cache, 'P137')
    # crew members
    add_attribute(attributes, id_cache, 'P1029')
    # taxon
    add_attribute(attributes, id_cache, 'P225')
    # chemical formula
    add_attribute(attributes, id_cache, 'P274')
    # winner (sports/contests)
    add_attribute(attributes, id_cache, 'P1346')
    # number of deaths
    add_attribute(attributes, id_cache, 'P1120')
    # currency code
    add_attribute(attributes, id_cache, 'P498')

    image = add_image(id_cache)

    if len(attributes) == 0 and len(urls) == 2 and len(description) == 0:
        results.append({
                       'url': urls[0]['url'],
                       'title': title,
                       'content': description
                       })
    else:
        results.append({
                       'infobox': title,
                       'id': wikipedia_link,
                       'content': description,
                       'img_src': image,
                       'attributes': attributes,
                       'urls': urls
                       })

    return results


# only returns first match
def add_image(id_cache):
    # P15: route map, P242: locator map, P154: logo, P18: image, P41: flag, P2716: collage, P2910: icon
    property_ids = ['P15', 'P242', 'P154', 'P18', 'P41', 'P2716', 'P2910']

    for property_id in property_ids:
        image = id_cache.get(property_id, None)
        if image is not None:
            image_name = eval_xpath(image, media_xpath)
            image_src = url_image.replace('{filename}', extract_text(image_name[0]))
            return image_src
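
# e.g. if the matched statement's media caption links the file "Tux.svg", this
# returns https://commons.wikimedia.org/wiki/Special:FilePath/Tux.svg?width=500&height=400
# (illustrative filename)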


# setting trim will only return high-ranked rows OR the first row
def add_attribute(attributes, id_cache, property_id, default_label=None, date=False, trim=False):
    attribute = id_cache.get(property_id, None)
    if attribute is not None:

        if default_label:
            label = default_label
        else:
            label = extract_text(eval_xpath(attribute, label_xpath))
            label = label[0].upper() + label[1:]

        if date:
            trim = True
            # remove calendar name
            calendar_name = eval_xpath(attribute, calendar_name_xpath)
            for calendar in calendar_name:
                calendar.getparent().remove(calendar)

        concat_values = ""
        values = []
        first_value = None
        for row in eval_xpath(attribute, property_row_xpath):
            if not first_value or not trim or eval_xpath(row, preferred_rank_xpath):
                value = eval_xpath(row, value_xpath)
                if not value:
                    continue
                value = extract_text(value)

                # save first value in case no ranked row is found
                if trim and not first_value:
                    first_value = value
                else:
                    # to avoid duplicate values
                    if value not in values:
                        concat_values += value + ", "
                        values.append(value)

        if trim and not values:
            attributes.append({'label': label,
                               'value': first_value})
        else:
            attributes.append({'label': label,
                               'value': concat_values[:-2]})
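
# e.g. add_attribute(attributes, id_cache, 'P569', date=True) might append
# {'label': 'Date of birth', 'value': '9 March 1934'} (illustrative values)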


# requires property_id unless it's a wiki link (defined in link_type)
def add_url(urls, result, id_cache, property_id=None, default_label=None, url_prefix=None, results=None,
            link_type=None):
    links = []

    # wiki links don't have a property on the wikidata page
    if link_type and 'wiki' in link_type:
        links.append(get_wikilink(result, link_type))
    else:
        dom_element = id_cache.get(property_id, None)
        if dom_element is not None:
            if not default_label:
                label = extract_text(eval_xpath(dom_element, label_xpath))
                label = label[0].upper() + label[1:]

            if link_type == 'geo':
                links.append(get_geolink(dom_element))

            elif link_type == 'imdb':
                links.append(get_imdblink(dom_element, url_prefix))

            else:
                url_results = eval_xpath(dom_element, url_xpath)
                for link in url_results:
                    if link is not None:
                        if url_prefix:
                            link = url_prefix + extract_text(link)
                        else:
                            link = extract_text(link)
                        links.append(link)

    # append urls
    for url in links:
        if url is not None:
            urls.append({'title': default_label or label,
                         'url': url})
            if results is not None:
                results.append({'title': default_label or label,
                                'url': url})
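
# e.g. add_url(urls, result, id_cache, 'P2002', 'Twitter', 'https://twitter.com/')
# appends {'title': 'Twitter', 'url': 'https://twitter.com/jack'} when the
# statement's external id is 'jack' (illustrative handle)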


def get_imdblink(result, url_prefix):
    imdb_id = eval_xpath(result, value_xpath)
    if imdb_id:
        imdb_id = extract_text(imdb_id)
        id_prefix = imdb_id[:2]
        if id_prefix == 'tt':
            url = url_prefix + 'title/' + imdb_id
        elif id_prefix == 'nm':
            url = url_prefix + 'name/' + imdb_id
        elif id_prefix == 'ch':
            url = url_prefix + 'character/' + imdb_id
        elif id_prefix == 'co':
            url = url_prefix + 'company/' + imdb_id
        elif id_prefix == 'ev':
            url = url_prefix + 'event/' + imdb_id
        else:
            url = None
        return url
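
# e.g. an id of 'tt0111161' maps to https://www.imdb.com/title/tt0111161; the
# two-letter prefix selects the IMDb URL namespace (illustrative id)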


def get_geolink(result):
    coordinates = eval_xpath(result, value_xpath)
    if not coordinates:
        return None
    coordinates = extract_text(coordinates[0])
    latitude, longitude = coordinates.split(',')

    # convert to decimal
    lat = int(latitude[:latitude.find(u'°')])
    if latitude.find('\'') >= 0:
        lat += int(latitude[latitude.find(u'°') + 1:latitude.find('\'')] or 0) / 60.0
    if latitude.find('"') >= 0:
        lat += float(latitude[latitude.find('\'') + 1:latitude.find('"')] or 0) / 3600.0
    if latitude.find('S') >= 0:
        lat *= -1
    lon = int(longitude[:longitude.find(u'°')])
    if longitude.find('\'') >= 0:
        lon += int(longitude[longitude.find(u'°') + 1:longitude.find('\'')] or 0) / 60.0
    if longitude.find('"') >= 0:
        lon += float(longitude[longitude.find('\'') + 1:longitude.find('"')] or 0) / 3600.0
    if longitude.find('W') >= 0:
        lon *= -1
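    # e.g. 46°14'6"N, 6°2'42"E parses to lat = 46 + 14/60 + 6/3600 ≈ 46.235 and
    # lon = 6 + 2/60 + 42/3600 ≈ 6.045 (illustrative coordinate string)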

    # TODO: get precision
    precision = 0.0002
    # there is no zoom information, deduce from precision (error prone)
    # samples :
    # 13 --> 5
    # 1 --> 6
    # 0.016666666666667 --> 9
    # 0.00027777777777778 --> 19
    # wolframalpha :
    # quadratic fit { {13, 5}, {1, 6}, {0.0166666, 9}, {0.0002777777,19}}
    # 14.1186-8.8322 x+0.625447 x^2
    if precision < 0.0003:
        zoom = 19
    else:
        zoom = int(15 - precision * 8.8322 + precision * precision * 0.625447)
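    # note: with the hard-coded precision of 0.0002 the first branch always
    # wins, so zoom is always 19 until precision is actually extracted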

    url = url_map\
        .replace('{latitude}', str(lat))\
        .replace('{longitude}', str(lon))\
        .replace('{zoom}', str(zoom))

    return url


def get_wikilink(result, wikiid):
    url = eval_xpath(result, wikilink_xpath.replace('{wikiid}', wikiid))
    if not url:
        return None
    url = url[0]
    if url.startswith('http://'):
        url = url.replace('http://', 'https://')
    elif url.startswith('//'):
        url = 'https:' + url
    return url
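
# e.g. get_wikilink(result, 'enwiki') returns the entity's English Wikipedia
# sitelink as an https:// URL, or None when the sitelink is absent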