Merge branch 'master' into conditional-sigusr1
commit 4a27dabcf7
Makefile

@@ -192,6 +192,7 @@ PYLINT_FILES=\
 	searx/engines/google_images.py \
 	searx/engines/mediathekviewweb.py \
 	searx/engines/solidtorrents.py \
+	searx/engines/solr.py \
 	searx/engines/google_scholar.py \
 	searx/engines/yahoo_news.py \
 	searx/engines/apkmirror.py \
searx/engines/acgsou.py (deleted)

@@ -1,74 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-or-later
-"""
- Acgsou (Japanese Animation/Music/Comics Bittorrent tracker)
-"""
-
-from urllib.parse import urlencode
-from lxml import html
-from searx.utils import extract_text, get_torrent_size, eval_xpath_list, eval_xpath_getindex
-
-# about
-about = {
-    "website": 'https://www.acgsou.com/',
-    "wikidata_id": None,
-    "official_api_documentation": None,
-    "use_official_api": False,
-    "require_api_key": False,
-    "results": 'HTML',
-}
-
-# engine dependent config
-categories = ['files', 'images', 'videos', 'music']
-paging = True
-
-# search-url
-base_url = 'https://www.acgsou.com/'
-search_url = base_url + 'search.php?{query}&page={offset}'
-# xpath queries
-xpath_results = '//table[contains(@class, "list_style table_fixed")]//tr[not(th)]'
-xpath_category = './/td[2]/a[1]'
-xpath_title = './/td[3]/a[last()]'
-xpath_torrent_links = './/td[3]/a'
-xpath_filesize = './/td[4]/text()'
-
-
-def request(query, params):
-    query = urlencode({'keyword': query})
-    params['url'] = search_url.format(query=query, offset=params['pageno'])
-    return params
-
-
-def response(resp):
-    results = []
-    dom = html.fromstring(resp.text)
-    for result in eval_xpath_list(dom, xpath_results):
-        # defaults
-        filesize = 0
-        magnet_link = "magnet:?xt=urn:btih:{}&tr=https://tracker.acgsou.com:2710/announce"
-
-        category = extract_text(eval_xpath_getindex(result, xpath_category, 0, default=[]))
-        page_a = eval_xpath_getindex(result, xpath_title, 0)
-        title = extract_text(page_a)
-        href = base_url + page_a.attrib.get('href')
-
-        magnet_link = magnet_link.format(page_a.attrib.get('href')[5:-5])
-
-        filesize_info = eval_xpath_getindex(result, xpath_filesize, 0, default=None)
-        if filesize_info:
-            try:
-                filesize = filesize_info[:-2]
-                filesize_multiplier = filesize_info[-2:]
-                filesize = get_torrent_size(filesize, filesize_multiplier)
-            except:
-                pass
-        # I didn't add download/seed/leech count since as I figured out they are generated randomly everytime
-        content = 'Category: "{category}".'
-        content = content.format(category=category)
-
-        results.append({'url': href,
-                        'title': title,
-                        'content': content,
-                        'filesize': filesize,
-                        'magnetlink': magnet_link,
-                        'template': 'torrent.html'})
-    return results
searx/engines/microsoft_academic.py

@@ -3,10 +3,7 @@
  Microsoft Academic (Science)
 """

-from datetime import datetime
-from json import loads
-from uuid import uuid4
-from urllib.parse import urlencode
+from json import dumps, loads
 from searx.utils import html_to_text


 # about
@@ -21,26 +18,25 @@ about = {

 categories = ['images']
 paging = True
-result_url = 'https://academic.microsoft.com/api/search/GetEntityResults?{query}'
+search_url = 'https://academic.microsoft.com/api/search'
+_paper_url = 'https://academic.microsoft.com/paper/{id}/reference'


 def request(query, params):
-    correlation_id = uuid4()
-    msacademic = uuid4()
-    time_now = datetime.now()
-
-    params['url'] = result_url.format(query=urlencode({'correlationId': correlation_id}))
-    params['cookies']['msacademic'] = str(msacademic)
-    params['cookies']['ai_user'] = 'vhd0H|{now}'.format(now=str(time_now))
+    params['url'] = search_url
     params['method'] = 'POST'
-    params['data'] = {
-        'Query': '@{query}@'.format(query=query),
-        'Limit': 10,
-        'Offset': params['pageno'] - 1,
-        'Filters': '',
-        'OrderBy': '',
-        'SortAscending': False,
-    }
+    params['headers']['content-type'] = 'application/json; charset=utf-8'
+    params['data'] = dumps({
+        'query': query,
+        'queryExpression': '',
+        'filters': [],
+        'orderBy': 0,
+        'skip': (params['pageno'] - 1) * 10,
+        'sortAscending': True,
+        'take': 10,
+        'includeCitationContexts': False,
+        'profileId': '',
+    })

     return params

@@ -51,10 +47,13 @@ def response(resp):
     if not response_data:
         return results

-    for result in response_data['results']:
-        url = _get_url(result)
-        title = result['e']['dn']
-        content = _get_content(result)
+    for result in response_data['pr']:
+        if 'dn' not in result['paper']:
+            continue
+
+        title = result['paper']['dn']
+        content = _get_content(result['paper'])
+        url = _paper_url.format(id=result['paper']['id'])
         results.append({
             'url': url,
             'title': html_to_text(title),
@@ -64,15 +63,9 @@ def response(resp):
     return results


-def _get_url(result):
-    if 's' in result['e']:
-        return result['e']['s'][0]['u']
-    return 'https://academic.microsoft.com/#/detail/{pid}'.format(pid=result['id'])
-
-
 def _get_content(result):
-    if 'd' in result['e']:
-        content = result['e']['d']
+    if 'd' in result:
+        content = result['d']
         if len(content) > 300:
             return content[:300] + '...'
         return content
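Note: the rewrite drops the cookie/uuid handshake against GetEntityResults in favour of a plain JSON POST. A minimal standalone sketch of the same exchange, assuming the endpoint accepts the payload exactly as the diff builds it (requests stands in for searx's HTTP layer; the query value is made up):

from json import dumps

import requests  # illustrative only; searx sends this through params['data']

payload = {
    'query': 'graph neural networks',  # hypothetical query
    'queryExpression': '',
    'filters': [],
    'orderBy': 0,
    'skip': 0,                         # (pageno - 1) * 10
    'sortAscending': True,
    'take': 10,
    'includeCitationContexts': False,
    'profileId': '',
}
resp = requests.post('https://academic.microsoft.com/api/search',
                     data=dumps(payload),
                     headers={'content-type': 'application/json; charset=utf-8'})
for entry in resp.json().get('pr', []):   # 'pr' / 'paper' / 'dn' per response() above
    paper = entry.get('paper', {})
    if 'dn' in paper:                      # 'dn' is the paper's display name
        print(paper['dn'])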
searx/engines/solr.py (new file, 74 lines)

@@ -0,0 +1,74 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""
+ Solr
+"""
+
+# pylint: disable=global-statement, missing-function-docstring
+
+from json import loads
+from urllib.parse import urlencode
+from searx.exceptions import SearxEngineAPIException
+
+
+base_url = 'http://localhost:8983'
+collection = ''
+rows = 10
+sort = '' # sorting: asc or desc
+field_list = 'name' # list of field names to display on the UI
+default_fields = '' # default field to query
+query_fields = '' # query fields
+_search_url = ''
+paging = True
+
+
+def init(_):
+    if collection == '':
+        raise ValueError('collection cannot be empty')
+
+    global _search_url
+    _search_url = base_url + '/solr/' + collection + '/select?{params}'
+
+
+def request(query, params):
+    query_params = {'q': query, 'rows': rows}
+    if field_list != '':
+        query_params['fl'] = field_list
+    if query_fields != '':
+        query_params['qf'] = query_fields
+    if default_fields != '':
+        query_params['df'] = default_fields
+    if sort != '':
+        query_params['sort'] = sort
+
+    if 'pageno' in params:
+        query_params['start'] = rows * (params['pageno'] - 1)
+
+    params['url'] = _search_url.format(params=urlencode(query_params))
+
+    return params
+
+
+def response(resp):
+    resp_json = __get_response(resp)
+
+    results = []
+    for result in resp_json['response']['docs']:
+        r = {key: str(value) for key, value in result.items()}
+        if len(r) == 0:
+            continue
+        r['template'] = 'key-value.html'
+        results.append(r)
+
+    return results
+
+
+def __get_response(resp):
+    try:
+        resp_json = loads(resp.text)
+    except Exception as e:
+        raise SearxEngineAPIException("failed to parse response") from e
+
+    if 'error' in resp_json:
+        raise SearxEngineAPIException(resp_json['error']['msg'])
+
+    return resp_json
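Note: the new engine just assembles a standard Solr select URL. A rough sketch of what request() above produces, with a hypothetical collection name and query (searx fills pageno from the UI):

from urllib.parse import urlencode

base_url = 'http://localhost:8983'
collection = 'techproducts'        # hypothetical collection name
rows = 10
pageno = 2

query_params = {
    'q': 'memory',                 # user query
    'rows': rows,
    'fl': 'name',                  # field_list default above
    'start': rows * (pageno - 1),  # paging offset for page 2
}
print(base_url + '/solr/' + collection + '/select?' + urlencode(query_params))
# http://localhost:8983/solr/techproducts/select?q=memory&rows=10&fl=name&start=10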
searx/engines/yahoo_news.py

@@ -22,13 +22,6 @@ from searx.utils import (

 from searx.engines.yahoo import parse_url

-# pylint: disable=unused-import
-from searx.engines.yahoo import (
-    _fetch_supported_languages,
-    supported_languages_url,
-)
-# pylint: enable=unused-import
-
 logger = logger.getChild('yahoo_news engine')

 # about
searx/engines/youtube_noapi.py

@@ -4,7 +4,7 @@
 """

 from functools import reduce
-from json import loads
+from json import loads, dumps
 from urllib.parse import quote_plus

 # about
@@ -20,12 +20,15 @@ about = {
 # engine dependent config
 categories = ['videos', 'music']
 paging = True
+language_support = False
 time_range_support = True

 # search-url
 base_url = 'https://www.youtube.com/results'
 search_url = base_url + '?search_query={query}&page={page}'
 time_range_url = '&sp=EgII{time_range}%253D%253D'
+# the key seems to be constant
+next_page_url = 'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'
 time_range_dict = {'day': 'Ag',
                    'week': 'Aw',
                    'month': 'BA',
@@ -40,21 +43,73 @@ base_youtube_url = 'https://www.youtube.com/watch?v='

 # do search-request
 def request(query, params):
-    params['url'] = search_url.format(query=quote_plus(query),
-                                      page=params['pageno'])
-    if params['time_range'] in time_range_dict:
-        params['url'] += time_range_url.format(time_range=time_range_dict[params['time_range']])
+    if not params['engine_data'].get('next_page_token'):
+        params['url'] = search_url.format(query=quote_plus(query), page=params['pageno'])
+        if params['time_range'] in time_range_dict:
+            params['url'] += time_range_url.format(time_range=time_range_dict[params['time_range']])
+    else:
+        print(params['engine_data']['next_page_token'])
+        params['url'] = next_page_url
+        params['method'] = 'POST'
+        params['data'] = dumps({
+            'context': {"client": {"clientName": "WEB", "clientVersion": "2.20210310.12.01"}},
+            'continuation': params['engine_data']['next_page_token'],
+        })
+        params['headers']['Content-Type'] = 'application/json'

     return params


 # get response from search-request
 def response(resp):
+    if resp.search_params.get('engine_data'):
+        return parse_next_page_response(resp.text)
+    return parse_first_page_response(resp.text)
+
+
+def parse_next_page_response(response_text):
     results = []
+    result_json = loads(response_text)
+    with open("/tmp/x", "w") as f:
+        f.write(response_text)
+    for section in (result_json['onResponseReceivedCommands'][0]
+                    .get('appendContinuationItemsAction')['continuationItems'][0]
+                    .get('itemSectionRenderer')['contents']):
+        if 'videoRenderer' not in section:
+            continue
+        section = section['videoRenderer']
+        content = "-"
+        if 'descriptionSnippet' in section:
+            content = ' '.join(x['text'] for x in section['descriptionSnippet']['runs'])
+        results.append({
+            'url': base_youtube_url + section['videoId'],
+            'title': ' '.join(x['text'] for x in section['title']['runs']),
+            'content': content,
+            'author': section['ownerText']['runs'][0]['text'],
+            'length': section['lengthText']['simpleText'],
+            'template': 'videos.html',
+            'embedded': embedded_url.format(videoid=section['videoId']),
+            'thumbnail': section['thumbnail']['thumbnails'][-1]['url'],
+        })
+    try:
+        token = result_json['onResponseReceivedCommands'][0]\
+            .get('appendContinuationItemsAction')['continuationItems'][1]\
+            .get('continuationItemRenderer')['continuationEndpoint']\
+            .get('continuationCommand')['token']
+        results.append({
+            "engine_data": token,
+            "key": "next_page_token",
+        })
+    except:
+        pass

-    results_data = resp.text[resp.text.find('ytInitialData'):]
+    return results
+
+
+def parse_first_page_response(response_text):
+    results = []
+    results_data = response_text[response_text.find('ytInitialData'):]
     results_data = results_data[results_data.find('{'):results_data.find(';</script>')]
-
     results_json = loads(results_data) if results_data else {}
     sections = results_json.get('contents', {})\
                            .get('twoColumnSearchResultsRenderer', {})\
@@ -63,6 +118,16 @@ def response(resp):
                            .get('contents', [])

     for section in sections:
+        if "continuationItemRenderer" in section:
+            next_page_token = section["continuationItemRenderer"]\
+                .get("continuationEndpoint", {})\
+                .get("continuationCommand", {})\
+                .get("token", "")
+            if next_page_token:
+                results.append({
+                    "engine_data": next_page_token,
+                    "key": "next_page_token",
+                })
         for video_container in section.get('itemSectionRenderer', {}).get('contents', []):
             video = video_container.get('videoRenderer', {})
             videoid = video.get('videoId')
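Note: paging now works as a continuation-token round trip. parse_first_page_response() emits a pseudo-result {'engine_data': token, 'key': 'next_page_token'}, searx hands the token back on the next page via params['engine_data'], and request() switches to a JSON POST against the youtubei endpoint. A minimal sketch of that handoff (build_continuation_request is a hypothetical helper mirroring the POST branch above; the token value is made up):

from json import dumps

next_page_url = 'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'

def build_continuation_request(engine_data):
    # Hypothetical helper: same url/method/body as the POST branch of request()
    return {
        'url': next_page_url,
        'method': 'POST',
        'headers': {'Content-Type': 'application/json'},
        'data': dumps({
            'context': {'client': {'clientName': 'WEB',
                                   'clientVersion': '2.20210310.12.01'}},
            'continuation': engine_data['next_page_token'],
        }),
    }

# Page 1 parse emitted, e.g.: {'engine_data': 'EpMDEg...', 'key': 'next_page_token'}
print(build_continuation_request({'next_page_token': 'EpMDEg...'})['data'])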
searx/settings.yml

@@ -82,6 +82,8 @@ outgoing: # communication with search engines
 #        https:
 #            - http://proxy1:8080
 #            - http://proxy2:8080
+#    using_tor_proxy : True
+#    extra_proxy_timeout : 10.0 # Extra seconds to add in order to account for the time taken by the proxy
 # uncomment below section only if you have more than one network interface
 # which can be the source of outgoing search requests
 #    source_ips:
@@ -159,6 +161,7 @@ engines:
   - name : ahmia
     engine : ahmia
     categories : onions
+    enable_http : True
     shortcut : ah

   - name : arch linux wiki
@@ -730,6 +733,8 @@ engines:
 # Requires Tor
   - name : not evil
     engine : not_evil
+    categories : onions
+    enable_http : True
     shortcut : ne

   - name : nyaa
@@ -737,12 +742,6 @@ engines:
     shortcut : nt
     disabled : True

-  - name : acgsou
-    engine : acgsou
-    shortcut : acg
-    disabled : True
-    timeout: 5.0
-
   - name : openairedatasets
     engine : json_engine
     paging : True
@@ -943,6 +942,17 @@ engines:
 #    api_client_id : *******
 #    api_client_secret : *******

+#  - name : solr
+#    engine : solr
+#    shortcut : slr
+#    base_url : http://localhost:8983
+#    collection : collection_name
+#    sort : '' # sorting: asc or desc
+#    field_list : '' # comma separated list of field names to display on the UI
+#    default_fields : '' # default field to query
+#    query_fields : '' # query fields
+#    enable_http : True
+
   - name : startpage
     engine : startpage
     shortcut : sp
@@ -979,6 +989,7 @@ engines:
     title_xpath : ./td[2]/b
     content_xpath : ./td[2]/small
     categories : onions
+    enable_http : True
     shortcut : tch

 # maybe in a fun category