[json_engine] mirror xpath functionality
This commit is contained in:
		
							parent
							
								
									e115aa048e
								
							
						
					
					
						commit
						f51008c414
					
				| @ -8,6 +8,8 @@ Configuration | ||||
| Request: | ||||
| 
 | ||||
| - :py:obj:`search_url` | ||||
| - :py:obj:`lang_all` | ||||
| - :py:obj:`soft_max_redirects` | ||||
| - :py:obj:`method` | ||||
| - :py:obj:`request_body` | ||||
| - :py:obj:`cookies` | ||||
| @ -19,10 +21,22 @@ Paging: | ||||
| - :py:obj:`page_size` | ||||
| - :py:obj:`first_page_num` | ||||
| 
 | ||||
| Time Range: | ||||
| 
 | ||||
| - :py:obj:`time_range_support` | ||||
| - :py:obj:`time_range_url` | ||||
| - :py:obj:`time_range_map` | ||||
| 
 | ||||
| Safe-Search: | ||||
| 
 | ||||
| - :py:obj:`safe_search_support` | ||||
| - :py:obj:`safe_search_map` | ||||
| 
 | ||||
| Response: | ||||
| 
 | ||||
| - :py:obj:`title_html_to_text` | ||||
| - :py:obj:`content_html_to_text` | ||||
| - :py:obj:`no_result_for_http_status` | ||||
| 
 | ||||
| JSON query: | ||||
| 
 | ||||
| @ -31,6 +45,8 @@ JSON query: | ||||
| - :py:obj:`url_prefix` | ||||
| - :py:obj:`title_query` | ||||
| - :py:obj:`content_query` | ||||
| - :py:obj:`thumbnail_query` | ||||
| - :py:obj:`thumbnail_prefix` | ||||
| - :py:obj:`suggestion_query` | ||||
| 
 | ||||
| 
 | ||||
| @ -61,12 +77,13 @@ from collections.abc import Iterable | ||||
| from json import loads | ||||
| from urllib.parse import urlencode | ||||
| from searx.utils import to_string, html_to_text | ||||
| from searx.network import raise_for_httperror | ||||
| 
 | ||||
| search_url = None | ||||
| """ | ||||
| Search URL of the engine.  Example:: | ||||
| 
 | ||||
|     https://example.org/?search={query}&page={pageno} | ||||
|     https://example.org/?search={query}&page={pageno}{time_range}{safe_search} | ||||
| 
 | ||||
| Replacements are: | ||||
| 
 | ||||
| @ -76,8 +93,41 @@ Replacements are: | ||||
| ``{pageno}``: | ||||
|   Page number if engine supports paging :py:obj:`paging` | ||||
| 
 | ||||
| ``{lang}``: | ||||
|   ISO 639-1 language code (en, de, fr ..) | ||||
| 
 | ||||
| ``{time_range}``: | ||||
|   :py:obj:`URL parameter <time_range_url>` if engine :py:obj:`supports time | ||||
|   range <time_range_support>`.  The value for the parameter is taken from | ||||
|   :py:obj:`time_range_map`. | ||||
| 
 | ||||
| ``{safe_search}``: | ||||
|   Safe-search :py:obj:`URL parameter <safe_search_map>` if engine | ||||
|   :py:obj:`supports safe-search <safe_search_support>`.  The ``{safe_search}`` | ||||
|   replacement is taken from the :py:obj:`safe_search_map`.  Filter results:: | ||||
| 
 | ||||
|       0: none, 1: moderate, 2: strict | ||||
| 
 | ||||
|   If not supported, the URL parameter is an empty string. | ||||
| 
 | ||||
| """ | ||||
| 
 | ||||
| lang_all = 'en' | ||||
| '''Replacement ``{lang}`` in :py:obj:`search_url` if language ``all`` is | ||||
| selected. | ||||
| ''' | ||||
| 
 | ||||
| no_result_for_http_status = [] | ||||
| '''Return empty result for these HTTP status codes instead of throwing an error. | ||||
| 
 | ||||
| .. code:: yaml | ||||
| 
 | ||||
|     no_result_for_http_status: [] | ||||
| ''' | ||||
| 
 | ||||
| soft_max_redirects = 0 | ||||
| '''Maximum redirects, soft limit. Record an error but don't stop the engine''' | ||||
| 
 | ||||
| method = 'GET' | ||||
| '''Some engines might require POST requests for search.''' | ||||
| 
 | ||||
| @ -140,6 +190,12 @@ title_query = None | ||||
| content_query = None | ||||
| '''JSON query of result's ``content``. For the query string documentation see :py:obj:`results_query`''' | ||||
| 
 | ||||
| thumbnail_query = False | ||||
| '''JSON query of result's ``thumbnail``. For the query string documentation see :py:obj:`results_query`''' | ||||
| 
 | ||||
| thumbnail_prefix = '' | ||||
| '''String to prepend to the result's ``thumbnail``.''' | ||||
| 
 | ||||
| suggestion_query = '' | ||||
| '''JSON query of result's ``suggestion``. For the query string documentation see :py:obj:`results_query`''' | ||||
| 
 | ||||
| @ -149,6 +205,53 @@ title_html_to_text = False | ||||
| content_html_to_text = False | ||||
| '''Extract text from an HTML content string''' | ||||
| 
 | ||||
| time_range_support = False | ||||
| '''Engine supports search time range.''' | ||||
| 
 | ||||
| time_range_url = '&hours={time_range_val}' | ||||
| '''Time range URL parameter in the :py:obj:`search_url`.  If no time range is | ||||
| requested by the user, the URL parameter is an empty string.  The | ||||
| ``{time_range_val}`` replacement is taken from the :py:obj:`time_range_map`. | ||||
| 
 | ||||
| .. code:: yaml | ||||
| 
 | ||||
|     time_range_url : '&days={time_range_val}' | ||||
| ''' | ||||
| 
 | ||||
| time_range_map = { | ||||
|     'day': 24, | ||||
|     'week': 24 * 7, | ||||
|     'month': 24 * 30, | ||||
|     'year': 24 * 365, | ||||
| } | ||||
| '''Maps time range value from user to ``{time_range_val}`` in | ||||
| :py:obj:`time_range_url`. | ||||
| 
 | ||||
| .. code:: yaml | ||||
| 
 | ||||
|     time_range_map: | ||||
|       day: 1 | ||||
|       week: 7 | ||||
|       month: 30 | ||||
|       year: 365 | ||||
| ''' | ||||
| 
 | ||||
| safe_search_support = False | ||||
| '''Engine supports safe-search.''' | ||||
| 
 | ||||
| safe_search_map = {0: '&filter=none', 1: '&filter=moderate', 2: '&filter=strict'} | ||||
| '''Maps safe-search value to ``{safe_search}`` in :py:obj:`search_url`. | ||||
| 
 | ||||
| .. code:: yaml | ||||
| 
 | ||||
|     safesearch: true | ||||
|     safe_search_map: | ||||
|       0: '&filter=none' | ||||
|       1: '&filter=moderate' | ||||
|       2: '&filter=strict' | ||||
| 
 | ||||
| ''' | ||||
| 
 | ||||
| 
 | ||||
| def iterate(iterable): | ||||
|     if isinstance(iterable, dict): | ||||
| @ -207,10 +310,26 @@ def query(data, query_string): | ||||
| 
 | ||||
| def request(query, params):  # pylint: disable=redefined-outer-name | ||||
|     '''Build request parameters (see :ref:`engine request`).''' | ||||
|     fp = {'query': urlencode({'q': query})[2:]}  # pylint: disable=invalid-name | ||||
|     lang = lang_all | ||||
|     if params['language'] != 'all': | ||||
|         lang = params['language'][:2] | ||||
| 
 | ||||
|     if paging and search_url.find('{pageno}') >= 0: | ||||
|         fp['pageno'] = (params['pageno'] - 1) * page_size + first_page_num | ||||
|     time_range = '' | ||||
|     if params.get('time_range'): | ||||
|         time_range_val = time_range_map.get(params.get('time_range')) | ||||
|         time_range = time_range_url.format(time_range_val=time_range_val) | ||||
| 
 | ||||
|     safe_search = '' | ||||
|     if params['safesearch']: | ||||
|         safe_search = safe_search_map[params['safesearch']] | ||||
| 
 | ||||
|     fp = {  # pylint: disable=invalid-name | ||||
|         'query': urlencode({'q': query})[2:], | ||||
|         'lang': lang, | ||||
|         'pageno': (params['pageno'] - 1) * page_size + first_page_num, | ||||
|         'time_range': time_range, | ||||
|         'safe_search': safe_search, | ||||
|     } | ||||
| 
 | ||||
|     params['cookies'].update(cookies) | ||||
|     params['headers'].update(headers) | ||||
| @ -223,6 +342,9 @@ def request(query, params):  # pylint: disable=redefined-outer-name | ||||
|         fp['query'] = query | ||||
|         params['data'] = request_body.format(**fp) | ||||
| 
 | ||||
|     params['soft_max_redirects'] = soft_max_redirects | ||||
|     params['raise_for_httperror'] = False | ||||
| 
 | ||||
|     return params | ||||
| 
 | ||||
| 
 | ||||
| @ -234,10 +356,16 @@ def response(resp): | ||||
|     '''Scrape *results* from the response (see :ref:`engine results`).''' | ||||
|     results = [] | ||||
| 
 | ||||
|     if no_result_for_http_status and resp.status_code in no_result_for_http_status: | ||||
|         return results | ||||
| 
 | ||||
|     raise_for_httperror(resp) | ||||
| 
 | ||||
|     if not resp.text: | ||||
|         return results | ||||
| 
 | ||||
|     json = loads(resp.text) | ||||
|     is_onion = 'onions' in categories | ||||
| 
 | ||||
|     title_filter = html_to_text if title_html_to_text else identity | ||||
|     content_filter = html_to_text if content_html_to_text else identity | ||||
| @ -256,13 +384,24 @@ def response(resp): | ||||
|                 content = query(result, content_query)[0] | ||||
|             except:  # pylint: disable=bare-except | ||||
|                 content = "" | ||||
|             results.append( | ||||
|                 { | ||||
| 
 | ||||
|             tmp_result = { | ||||
|                 'url': url_prefix + to_string(url), | ||||
|                 'title': title_filter(to_string(title)), | ||||
|                 'content': content_filter(to_string(content)), | ||||
|             } | ||||
|             ) | ||||
| 
 | ||||
|             if thumbnail_query: | ||||
|                 try: | ||||
|                     thumbnail_query_result = query(result, thumbnail_query)[0] | ||||
|                     tmp_result['thumbnail'] = thumbnail_prefix + to_string(thumbnail_query_result) | ||||
|                 except:  # pylint: disable=bare-except | ||||
|                     continue | ||||
| 
 | ||||
|             if is_onion: | ||||
|                 tmp_result['is_onion'] = True | ||||
| 
 | ||||
|             results.append(tmp_result) | ||||
|     else: | ||||
|         for result in json: | ||||
|             url = query(result, url_query)[0] | ||||
| @ -274,6 +413,7 @@ def response(resp): | ||||
|                     'url': url_prefix + to_string(url), | ||||
|                     'title': title_filter(to_string(title)), | ||||
|                     'content': content_filter(to_string(content)), | ||||
|                     'is_onion': is_onion, | ||||
|                 } | ||||
|             ) | ||||
| 
 | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user