| 
									
										
										
										
											2020-10-26 19:19:18 +01:00
										 |  |  | #!/usr/bin/env python | 
					
						
							| 
									
										
										
										
											2021-10-03 15:12:09 +02:00
										 |  |  | # SPDX-License-Identifier: AGPL-3.0-or-later | 
					
						
							| 
									
										
										
										
											2022-01-03 12:40:06 +01:00
										 |  |  | """Fetch units from :origin:`searx/engines/wikidata.py` engine.
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Output file: :origin:`searx/data/wikidata_units.json` (:origin:`CI Update data | 
					
						
							|  |  |  | ...  <.github/workflows/data-update.yml>`). | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | """
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-26 19:19:18 +01:00
										 |  |  | import json | 
					
						
							|  |  |  | import collections | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # set path | 
					
						
							| 
									
										
										
										
											2021-02-25 17:42:52 +01:00
										 |  |  | from os.path import join | 
					
						
							| 
									
										
										
										
											2020-10-26 19:19:18 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | from searx import searx_dir | 
					
						
							| 
									
										
										
										
											2021-09-19 11:10:02 +02:00
										 |  |  | from searx.engines import wikidata, set_loggers | 
					
						
							| 
									
										
										
										
											2024-03-10 15:33:23 +01:00
										 |  |  | from searx.data import data_dir | 
					
						
							|  |  |  | 
 | 
					
						
# Path of the JSON file that is written when this module is run as a script.
DATA_FILE = data_dir / 'wikidata_units.json'

# NOTE(review): presumably attaches a logger named 'wikidata' to the engine
# module so that its query helper can log -- confirm in searx.engines.
set_loggers(wikidata, 'wikidata')
					
						
							| 
									
										
										
										
											2020-10-26 19:19:18 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-02-23 13:10:38 +01:00
# The response contains duplicate ?item rows, each with a different ?symbol.
# "ORDER BY ?item DESC(?rank) ?symbol" provides a deterministic result
# even if an ?item has several ?symbol values of the same rank.
# A deterministic result matters because get_data() keeps only the first row
# it sees for each ?item.
# See:
# * https://www.wikidata.org/wiki/Help:Ranking
# * https://www.mediawiki.org/wiki/Wikibase/Indexing/RDF_Dump_Format ("Statement representation" section)
# * https://w.wiki/32BT
# * https://en.wikibooks.org/wiki/SPARQL/WIKIDATA_Precision,_Units_and_Coordinates#Quantities
#   see the result for https://www.wikidata.org/wiki/Q11582
#   there are multiple symbols with the same rank
					
						
							| 
									
										
										
										
											2020-10-26 19:19:18 +01:00
# SPARQL query sent to Wikidata by get_data().
#
# Selected columns:
#   ?item      entity URI of the unit
#   ?symbol    English-language value of the item's P5061 statement
#   ?tosi      quantity amount of the item's P2370 statement (optional)
#   ?tosiUnit  quantity unit of the item's P2370 statement (optional)
#
# NOTE: the constant's name is a misspelling of "SPARQL"; it is kept as is
# because get_data() refers to it by this name.
SARQL_REQUEST = """
SELECT DISTINCT ?item ?symbol ?tosi ?tosiUnit
WHERE
{
  ?item wdt:P31/wdt:P279 wd:Q47574 .
  ?item p:P5061 ?symbolP .
  ?symbolP ps:P5061 ?symbol ;
           wikibase:rank ?rank .
  OPTIONAL {
    ?item p:P2370 ?tosistmt .
    ?tosistmt psv:P2370 ?tosinode .
    ?tosinode wikibase:quantityAmount ?tosi .
    ?tosinode wikibase:quantityUnit ?tosiUnit .
  }
  FILTER(LANG(?symbol) = "en").
}
ORDER BY ?item DESC(?rank) ?symbol
"""
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def get_data(): | 
					
						
							| 
									
										
										
										
											2021-02-23 13:10:38 +01:00
										 |  |  |     results = collections.OrderedDict() | 
					
						
							| 
									
										
										
										
											2021-09-19 11:10:02 +02:00
										 |  |  |     response = wikidata.send_wikidata_query(SARQL_REQUEST) | 
					
						
							| 
									
										
										
										
											2021-02-23 13:10:38 +01:00
										 |  |  |     for unit in response['results']['bindings']: | 
					
						
							| 
									
										
										
										
											2024-05-01 18:25:22 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-07 16:17:11 +02:00
										 |  |  |         symbol = unit['symbol']['value'] | 
					
						
							| 
									
										
										
										
											2024-05-01 18:25:22 +02:00
										 |  |  |         name = unit['item']['value'].rsplit('/', 1)[1] | 
					
						
							|  |  |  |         si_name = unit.get('tosiUnit', {}).get('value', '') | 
					
						
							|  |  |  |         if si_name: | 
					
						
							|  |  |  |             si_name = si_name.rsplit('/', 1)[1] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-07 16:17:11 +02:00
										 |  |  |         to_si_factor = unit.get('tosi', {}).get('value', '') | 
					
						
							| 
									
										
										
										
											2021-02-23 13:10:38 +01:00
										 |  |  |         if name not in results: | 
					
						
							|  |  |  |             # ignore duplicate: always use the first one | 
					
						
							| 
									
										
										
										
											2024-04-07 16:17:11 +02:00
										 |  |  |             results[name] = { | 
					
						
							|  |  |  |                 'symbol': symbol, | 
					
						
							|  |  |  |                 'si_name': si_name if si_name else None, | 
					
						
							|  |  |  |                 'to_si_factor': float(to_si_factor) if to_si_factor else None, | 
					
						
							|  |  |  |             } | 
					
						
							| 
									
										
										
										
											2021-02-23 13:10:38 +01:00
										 |  |  |     return results | 
					
						
							| 
									
										
										
										
											2020-10-26 19:19:18 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def get_wikidata_units_filename(): | 
					
						
							| 
									
										
										
										
											2024-03-10 15:33:23 +01:00
										 |  |  |     return join(join(searx_dir, "data"), "") | 
					
						
							| 
									
										
										
										
											2020-10-26 19:19:18 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-03 12:40:06 +01:00
										 |  |  | if __name__ == '__main__': | 
					
						
							| 
									
										
										
										
											2024-03-10 15:33:23 +01:00
										 |  |  |     with DATA_FILE.open('w', encoding="utf8") as f: | 
					
						
							|  |  |  |         json.dump(get_data(), f, indent=4, sort_keys=True, ensure_ascii=False) |