Merge pull request #204 from return42/fix-qwant
[fix] Qwant engine - implement API v3 and add engine "qwant videos"
commit 7efb527719
@@ -25607,6 +25607,49 @@
     "zh-CN",
     "zh-HK"
   ],
+  "qwant videos": [
+    "bg-BG",
+    "ca-ES",
+    "cs-CZ",
+    "da-DK",
+    "de-AT",
+    "de-CH",
+    "de-DE",
+    "el-GR",
+    "en-AU",
+    "en-CA",
+    "en-GB",
+    "en-IE",
+    "en-IN",
+    "en-MY",
+    "en-NZ",
+    "en-US",
+    "es-AR",
+    "es-CL",
+    "es-ES",
+    "es-MX",
+    "et-EE",
+    "fi-FI",
+    "fr-BE",
+    "fr-CA",
+    "fr-CH",
+    "fr-FR",
+    "hu-HU",
+    "it-CH",
+    "it-IT",
+    "ko-KR",
+    "nb-NO",
+    "nl-BE",
+    "nl-NL",
+    "pl-PL",
+    "pt-BR",
+    "pt-PT",
+    "ro-RO",
+    "sv-SE",
+    "th-TH",
+    "zh-CN",
+    "zh-HK"
+  ],
   "startpage": {
     "af": {
       "alias": "afrikaans"
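The hunks that follow rewrite the engine module for the v3 endpoint. For orientation, here is a minimal sketch of the request URL before and after the change; both format strings are copied verbatim from the diff below, the query value is hypothetical, and the real engine fills {query} with urlencode() and clamps the offset as the code shows (for example, with count = 50 for images, page 5 would give offset 200, which is clamped to 199 so that count + offset stays below 250).

# Illustrative only -- the two format strings are copied from the qwant.py diff below.
OLD_URL = 'https://api.qwant.com/api/search/{keyword}?count=10&offset={offset}&f=&{query}&t={keyword}&uiv=4'
NEW_URL = 'https://api.qwant.com/v3/search/{keyword}?q={query}&count={count}&offset={offset}'

# Hypothetical "qwant videos" request for page 1 (the real engine urlencodes the query):
print(NEW_URL.format(keyword='videos', query='searx', count=10, offset=0))
# -> https://api.qwant.com/v3/search/videos?q=searx&count=10&offset=0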
@@ -1,15 +1,42 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
-"""
- Qwant (Web, Images, News, Social)
+# lint: pylint
+"""Qwant (Web, News, Images, Videos)
+
+This engine uses the Qwant API (https://api.qwant.com/v3). The API is
+undocumented but can be reverse engineered by reading the network log of
+https://www.qwant.com/ queries.
+
+This implementation is used by different qwant engines in the settings.yml::
+
+  - name: qwant
+    categories: general
+    ...
+  - name: qwant news
+    categories: news
+    ...
+  - name: qwant images
+    categories: images
+    ...
+  - name: qwant videos
+    categories: videos
+    ...
+
 """
 
-from datetime import datetime
+from datetime import (
+    datetime,
+    timedelta,
+)
 from json import loads
 from urllib.parse import urlencode
-from searx.utils import html_to_text, match_language
-from searx.exceptions import SearxEngineAPIException, SearxEngineCaptchaException
+
+# from searx import logger
+from searx.utils import match_language
+from searx.exceptions import SearxEngineAPIException
 from searx.network import raise_for_httperror
 
+#logger = logger.getChild('qwant')
+
 # about
 about = {
     "website": 'https://www.qwant.com/',
@@ -25,98 +52,148 @@ categories = []
 paging = True
 supported_languages_url = about['website']
 
-category_to_keyword = {'general': 'web',
-                       'images': 'images',
-                       'news': 'news'}
+category_to_keyword = {
+    'general': 'web',
+    'news': 'news',
+    'images': 'images',
+    'videos': 'videos',
+}
 
 # search-url
-url = 'https://api.qwant.com/api/search/{keyword}?count=10&offset={offset}&f=&{query}&t={keyword}&uiv=4'
+url = 'https://api.qwant.com/v3/search/{keyword}?q={query}&count={count}&offset={offset}'
 
 
-# do search-request
 def request(query, params):
-    offset = (params['pageno'] - 1) * 10
+    """Qwant search request"""
+    keyword = category_to_keyword[categories[0]]
+    count = 10  # web: count must be equal to 10
 
-    if categories[0] and categories[0] in category_to_keyword:
-
-        params['url'] = url.format(keyword=category_to_keyword[categories[0]],
-                                   query=urlencode({'q': query}),
-                                   offset=offset)
+    if keyword == 'images':
+        count = 50
+        offset = (params['pageno'] - 1) * count
+        # count + offset must be lower than 250
+        offset = min(offset, 199)
     else:
-        params['url'] = url.format(keyword='web',
-                                   query=urlencode({'q': query}),
-                                   offset=offset)
+        offset = (params['pageno'] - 1) * count
+        # count + offset must be lower than 50
+        offset = min(offset, 40)
+
+    params['url'] = url.format(
+        keyword = keyword,
+        query = urlencode({'q': query}),
+        offset = offset,
+        count = count,
+    )
 
     # add language tag
     if params['language'] != 'all':
-        language = match_language(params['language'], supported_languages, language_aliases)
-        params['url'] += '&locale=' + language.replace('-', '_').lower()
+        language = match_language(
+            params['language'],
+            # pylint: disable=undefined-variable
+            supported_languages,
+            language_aliases,
+        )
+        params['url'] += '&locale=' + language.replace('-', '_')
 
-    params['headers']['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0'
     params['raise_for_httperror'] = False
     return params
 
 
-# get response from search-request
 def response(resp):
+    """Get response from Qwant's search request"""
+
+    keyword = category_to_keyword[categories[0]]
     results = []
 
-    # According to https://www.qwant.com/js/app.js
-    if resp.status_code == 429:
-        raise SearxEngineCaptchaException()
+    # load JSON result
+    search_results = loads(resp.text)
+    data = search_results.get('data', {})
+
+    # check for an API error
+    if search_results.get('status') != 'success':
+        msg = ",".join(data.get('message', ['unknown', ]))
+        raise SearxEngineAPIException('API error::' + msg)
 
     # raise for other errors
     raise_for_httperror(resp)
 
-    # load JSON result
-    search_results = loads(resp.text)
-
-    # check for an API error
-    if search_results.get('status') != 'success':
-        raise SearxEngineAPIException('API error ' + str(search_results.get('error', '')))
+    if keyword == 'web':
+        # The WEB query contains a list named 'mainline'.  This list can contain
+        # different result types (e.g. mainline[0]['type'] returns type of the
+        # result items in mainline[0]['items']
+        mainline = data.get('result', {}).get('items', {}).get('mainline', {})
+    else:
+        # Queries on News, Images and Videos do not have a list named 'mainline'
+        # in the response.  The result items are directly in the list
+        # result['items'].
+        mainline = data.get('result', {}).get('items', [])
+        mainline = [
+            {'type' : keyword, 'items' : mainline },
+        ]
 
     # return empty array if there are no results
-    if 'data' not in search_results:
+    if not mainline:
         return []
 
-    data = search_results.get('data', {})
+    for row in mainline:
 
-    res = data.get('result', {})
+        mainline_type = row.get('type', 'web')
+        if mainline_type == 'ads':
+            # ignore adds
+            continue
 
-    # parse results
-    for result in res.get('items', {}):
+        mainline_items = row.get('items', [])
+        for item in mainline_items:
 
-        title = html_to_text(result['title'])
-        res_url = result['url']
-        content = html_to_text(result['desc'])
+            title = item['title']
+            res_url = item['url']
 
-        if category_to_keyword.get(categories[0], '') == 'web':
-            results.append({'title': title,
-                            'content': content,
-                            'url': res_url})
+            if mainline_type == 'web':
+                content = item['desc']
+                results.append({
+                    'title': title,
+                    'url': res_url,
+                    'content': content,
+                })
 
-        elif category_to_keyword.get(categories[0], '') == 'images':
-            thumbnail_src = result['thumbnail']
-            img_src = result['media']
-            results.append({'template': 'images.html',
-                            'url': res_url,
-                            'title': title,
-                            'content': '',
-                            'thumbnail_src': thumbnail_src,
-                            'img_src': img_src})
-
-        elif category_to_keyword.get(categories[0], '') == 'news':
-            published_date = datetime.fromtimestamp(result['date'], None)
-            media = result.get('media', [])
-            if len(media) > 0:
-                img_src = media[0].get('pict', {}).get('url', None)
-            else:
+            elif mainline_type == 'news':
+                pub_date = datetime.fromtimestamp(item['date'], None)
+                news_media = item.get('media', [])
                 img_src = None
-            results.append({'url': res_url,
-                            'title': title,
-                            'publishedDate': published_date,
-                            'content': content,
-                            'img_src': img_src})
+                if news_media:
+                    img_src = news_media[0].get('pict', {}).get('url', None)
+                results.append({
+                    'title': title,
+                    'url': res_url,
+                    'publishedDate': pub_date,
+                    'img_src': img_src,
+                })
+
+            elif mainline_type == 'images':
+                thumbnail = item['thumbnail']
+                img_src = item['media']
+                results.append({
+                    'title': title,
+                    'url': res_url,
+                    'template': 'images.html',
+                    'thumbnail_src': thumbnail,
+                    'img_src': img_src,
+                })
+
+            elif mainline_type == 'videos':
+                content = item['desc']
+                length = timedelta(seconds=item['duration'])
+                pub_date = datetime.fromtimestamp(item['date'])
+                thumbnail = item['thumbnail']
+
+                results.append({
+                    'title': title,
+                    'url': res_url,
+                    'content': content,
+                    'publishedDate': pub_date,
+                    'thumbnail': thumbnail,
+                    'template': 'videos.html',
+                    'length':  length,
+            })
 
     return results
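To make the rewritten response() easier to follow, here is a rough sketch of the JSON shapes it assumes. The v3 API is undocumented, so this is not an official schema; the field names are simply the ones the accessors in the diff touch, and all values are made up.

# Rough sketch of the payloads as read by the new response() -- only the fields
# the parser touches are shown, with made-up values.
web_response = {
    'status': 'success',
    'data': {
        'result': {
            'items': {
                # web queries: result items are grouped by type under 'mainline'
                'mainline': [
                    {'type': 'web', 'items': [
                        {'title': 'Example', 'url': 'https://example.org', 'desc': 'An example page'},
                    ]},
                    {'type': 'ads', 'items': []},  # rows of type 'ads' are skipped
                ],
            },
        },
    },
}

videos_response = {
    'status': 'success',
    'data': {
        'result': {
            # news/images/videos queries: a flat item list, which response()
            # wraps into a single mainline row tagged with the engine keyword
            'items': [
                {'title': 'Example clip', 'url': 'https://example.org/video',
                 'desc': 'An example description', 'duration': 120,
                 'date': 1618300800, 'thumbnail': 'https://example.org/thumb.jpg'},
            ],
        },
    },
}

With shapes like these, a web response is walked row by row through 'mainline', while the flat videos list is first normalized to [{'type': 'videos', 'items': [...]}] so the same loop can render web, news, images and videos items.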
@@ -986,20 +986,29 @@ engines:
     engine: qwant
     shortcut: qw
     categories: general
-    disabled: true
+    disabled: false
     additional_tests:
       rosebud: *test_rosebud
 
-  - name: qwant images
-    engine: qwant
-    shortcut: qwi
-    categories: images
-    network: qwant
-
   - name: qwant news
     engine: qwant
     shortcut: qwn
     categories: news
+    disabled: false
+    network: qwant
+
+  - name: qwant images
+    engine: qwant
+    shortcut: qwi
+    categories: images
+    disabled: false
+    network: qwant
+
+  - name: qwant videos
+    engine: qwant
+    shortcut: qwv
+    categories: videos
+    disabled: false
     network: qwant
 
   # - name: library