| 
									
										
										
										
											2021-02-22 18:03:24 +01:00
										 |  |  | #!/usr/bin/env python | 
					
						
							| 
									
										
										
										
											2021-10-03 15:12:09 +02:00
										 |  |  | # SPDX-License-Identifier: AGPL-3.0-or-later | 
					
						
							| 
									
										
										
										
											2022-01-03 12:40:06 +01:00
										 |  |  | """Update :origin:`searx/data/external_bangs.json` using the duckduckgo bangs
 | 
					
						
							| 
									
										
										
										
											2024-03-10 11:49:37 +01:00
										 |  |  | from :py:obj:`BANGS_URL`. | 
					
						
							| 
									
										
										
										
											2022-01-03 12:40:06 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-10 11:49:37 +01:00
										 |  |  | - :origin:`CI Update data ... <.github/workflows/data-update.yml>` | 
					
						
							| 
									
										
										
										
											2022-01-03 12:40:06 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-02-22 18:03:24 +01:00
										 |  |  | """
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import json | 
					
						
							| 
									
										
										
										
											2021-03-18 19:59:01 +01:00
										 |  |  | import httpx | 
					
						
							| 
									
										
										
										
											2021-02-22 18:03:24 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-12 18:08:48 +01:00
										 |  |  | from searx.external_bang import LEAF_KEY | 
					
						
							| 
									
										
										
										
											2024-03-10 15:33:23 +01:00
										 |  |  | from searx.data import data_dir | 
					
						
							| 
									
										
										
										
											2021-02-22 18:03:24 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-10 15:33:23 +01:00
# Destination file of the generated bang trie (written by main()).
DATA_FILE = data_dir / 'external_bangs.json'

BANGS_URL = 'https://duckduckgo.com/bang.js'
"""JSON file which contains the bangs."""

# URL scheme prefixes (without the "//") used by parse_ddg_bangs() when it
# shortens https URLs and de-duplicates http URLs.
HTTPS_COLON = 'https:'
HTTP_COLON = 'http:'
					
						
							| 
									
										
										
										
											2021-02-22 18:03:24 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-10 11:49:37 +01:00
										 |  |  | def main(): | 
					
						
							|  |  |  |     print(f'fetch bangs from {BANGS_URL}') | 
					
						
							|  |  |  |     response = httpx.get(BANGS_URL) | 
					
						
							| 
									
										
										
										
											2021-02-22 18:03:24 +01:00
										 |  |  |     response.raise_for_status() | 
					
						
							| 
									
										
										
										
											2024-03-10 11:49:37 +01:00
										 |  |  |     ddg_bangs = json.loads(response.content.decode()) | 
					
						
							|  |  |  |     trie = parse_ddg_bangs(ddg_bangs) | 
					
						
							|  |  |  |     output = { | 
					
						
							|  |  |  |         'version': 0, | 
					
						
							|  |  |  |         'trie': trie, | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2024-03-10 15:33:23 +01:00
										 |  |  |     with DATA_FILE.open('w', encoding="utf8") as f: | 
					
						
							|  |  |  |         json.dump(output, f, indent=4, sort_keys=True, ensure_ascii=False) | 
					
						
							| 
									
										
										
										
											2021-02-22 18:03:24 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def merge_when_no_leaf(node): | 
					
						
							|  |  |  |     """Minimize the number of nodes
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-12 18:08:48 +01:00
										 |  |  |     ``A -> B -> C`` | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     - ``B`` is child of ``A`` | 
					
						
							|  |  |  |     - ``C`` is child of ``B`` | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     If there are no ``C`` equals to ``<LEAF_KEY>``, then each ``C`` are merged | 
					
						
							|  |  |  |     into ``A``.  For example (5 nodes):: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |       d -> d -> g -> <LEAF_KEY> (ddg) | 
					
						
							|  |  |  |         -> i -> g -> <LEAF_KEY> (dig) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-09-15 09:53:03 +02:00
										 |  |  |     becomes (3 nodes):: | 
					
						
							| 
									
										
										
										
											2021-02-22 18:03:24 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-12 18:08:48 +01:00
										 |  |  |       d -> dg -> <LEAF_KEY> | 
					
						
							|  |  |  |         -> ig -> <LEAF_KEY> | 
					
						
							| 
									
										
										
										
											2021-02-22 18:03:24 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     restart = False | 
					
						
							|  |  |  |     if not isinstance(node, dict): | 
					
						
							|  |  |  |         return | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # create a copy of the keys so node can be modified | 
					
						
							|  |  |  |     keys = list(node.keys()) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     for key in keys: | 
					
						
							| 
									
										
										
										
											2022-01-12 18:08:48 +01:00
										 |  |  |         if key == LEAF_KEY: | 
					
						
							| 
									
										
										
										
											2021-02-22 18:03:24 +01:00
										 |  |  |             continue | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         value = node[key] | 
					
						
							|  |  |  |         value_keys = list(value.keys()) | 
					
						
							| 
									
										
										
										
											2022-01-12 18:08:48 +01:00
										 |  |  |         if LEAF_KEY not in value_keys: | 
					
						
							| 
									
										
										
										
											2021-02-22 18:03:24 +01:00
										 |  |  |             for value_key in value_keys: | 
					
						
							|  |  |  |                 node[key + value_key] = value[value_key] | 
					
						
							|  |  |  |                 merge_when_no_leaf(node[key + value_key]) | 
					
						
							|  |  |  |             del node[key] | 
					
						
							|  |  |  |             restart = True | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             merge_when_no_leaf(value) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if restart: | 
					
						
							|  |  |  |         merge_when_no_leaf(node) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def optimize_leaf(parent, parent_key, node): | 
					
						
							|  |  |  |     if not isinstance(node, dict): | 
					
						
							|  |  |  |         return | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-01-12 18:08:48 +01:00
										 |  |  |     if len(node) == 1 and LEAF_KEY in node and parent is not None: | 
					
						
							|  |  |  |         parent[parent_key] = node[LEAF_KEY] | 
					
						
							| 
									
										
										
										
											2021-02-22 18:03:24 +01:00
										 |  |  |     else: | 
					
						
							|  |  |  |         for key, value in node.items(): | 
					
						
							|  |  |  |             optimize_leaf(node, key, value) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def parse_ddg_bangs(ddg_bangs): | 
					
						
							|  |  |  |     bang_trie = {} | 
					
						
							|  |  |  |     bang_urls = {} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     for bang_definition in ddg_bangs: | 
					
						
							|  |  |  |         # bang_list | 
					
						
							|  |  |  |         bang_url = bang_definition['u'] | 
					
						
							|  |  |  |         if '{{{s}}}' not in bang_url: | 
					
						
							|  |  |  |             # ignore invalid bang | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         bang_url = bang_url.replace('{{{s}}}', chr(2)) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # only for the https protocol: "https://example.com" becomes "//example.com" | 
					
						
							|  |  |  |         if bang_url.startswith(HTTPS_COLON + '//'): | 
					
						
							| 
									
										
										
										
											2021-12-27 09:26:22 +01:00
										 |  |  |             bang_url = bang_url[len(HTTPS_COLON) :] | 
					
						
							| 
									
										
										
										
											2021-02-22 18:03:24 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  |         # | 
					
						
							| 
									
										
										
										
											2021-12-27 09:26:22 +01:00
										 |  |  |         if bang_url.startswith(HTTP_COLON + '//') and bang_url[len(HTTP_COLON) :] in bang_urls: | 
					
						
							| 
									
										
										
										
											2021-02-22 18:03:24 +01:00
										 |  |  |             # if the bang_url uses the http:// protocol, and the same URL exists in https:// | 
					
						
							|  |  |  |             # then reuse the https:// bang definition. (written //example.com) | 
					
						
							| 
									
										
										
										
											2021-12-27 09:26:22 +01:00
										 |  |  |             bang_def_output = bang_urls[bang_url[len(HTTP_COLON) :]] | 
					
						
							| 
									
										
										
										
											2021-02-22 18:03:24 +01:00
										 |  |  |         else: | 
					
						
							|  |  |  |             # normal use case : new http:// URL or https:// URL (without "https:", see above) | 
					
						
							|  |  |  |             bang_rank = str(bang_definition['r']) | 
					
						
							|  |  |  |             bang_def_output = bang_url + chr(1) + bang_rank | 
					
						
							|  |  |  |             bang_def_output = bang_urls.setdefault(bang_url, bang_def_output) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         bang_urls[bang_url] = bang_def_output | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # bang name | 
					
						
							|  |  |  |         bang = bang_definition['t'] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # bang_trie | 
					
						
							|  |  |  |         t = bang_trie | 
					
						
							|  |  |  |         for bang_letter in bang: | 
					
						
							|  |  |  |             t = t.setdefault(bang_letter, {}) | 
					
						
							| 
									
										
										
										
											2022-01-12 18:08:48 +01:00
										 |  |  |         t = t.setdefault(LEAF_KEY, bang_def_output) | 
					
						
							| 
									
										
										
										
											2021-02-22 18:03:24 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # optimize the trie | 
					
						
							|  |  |  |     merge_when_no_leaf(bang_trie) | 
					
						
							|  |  |  |     optimize_leaf(None, None, bang_trie) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return bang_trie | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
# Regenerate the data file only when executed as a script, not on import.
if __name__ == '__main__':
    main()