From a96f503d7b4866e6eb352afd759433b3aad0a3f5 Mon Sep 17 00:00:00 2001 From: Alexandre FLAMENT Date: Fri, 26 Aug 2022 16:04:50 +0000 Subject: [PATCH 1/7] Add searx.webutils.searxng_format_date * Move the datetime to str code from searx.webapp.search to searx.webutils.searxng_format_date * When the month, day, hour, day and second are zero, the function returns only the year. --- searx/webapp.py | 17 ++--------------- searx/webutils.py | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/searx/webapp.py b/searx/webapp.py index bd76cc534..e6bda42be 100755 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -12,7 +12,6 @@ import os import sys import base64 -from datetime import datetime, timedelta from timeit import default_timer from html import escape from io import StringIO @@ -45,7 +44,6 @@ from flask.json import jsonify from flask_babel import ( Babel, gettext, - format_date, format_decimal, ) @@ -79,6 +77,7 @@ from searx.webutils import ( is_hmac_of, is_flask_run_cmdline, group_engines_in_tab, + searxng_format_date, ) from searx.webadapter import ( get_search_query_from_webapp, @@ -718,25 +717,13 @@ def search(): if 'url' in result: result['pretty_url'] = prettify_url(result['url']) - # TODO, check if timezone is calculated right # pylint: disable=fixme if result.get('publishedDate'): # do not try to get a date from an empty string or a None type try: # test if publishedDate >= 1900 (datetime module bug) result['pubdate'] = result['publishedDate'].strftime('%Y-%m-%d %H:%M:%S%z') except ValueError: result['publishedDate'] = None else: - if result['publishedDate'].replace(tzinfo=None) >= datetime.now() - timedelta(days=1): - timedifference = datetime.now() - result['publishedDate'].replace(tzinfo=None) - minutes = int((timedifference.seconds / 60) % 60) - hours = int(timedifference.seconds / 60 / 60) - if hours == 0: - result['publishedDate'] = gettext('{minutes} minute(s) ago').format(minutes=minutes) - else: - result['publishedDate'] = gettext('{hours} hour(s), {minutes} minute(s) ago').format( - hours=hours, minutes=minutes - ) - else: - result['publishedDate'] = format_date(result['publishedDate']) + result['publishedDate'] = searxng_format_date(result['publishedDate']) # set result['open_group'] = True when the template changes from the previous result # set result['close_group'] = True when the template changes on the next result diff --git a/searx/webutils.py b/searx/webutils.py index b18fd5c6a..f084fe9d3 100644 --- a/searx/webutils.py +++ b/searx/webutils.py @@ -7,11 +7,14 @@ import hmac import re import inspect import itertools +from datetime import datetime, timedelta from typing import Iterable, List, Tuple, Dict from io import StringIO from codecs import getincrementalencoder +from flask_babel import gettext, format_date + from searx import logger, settings from searx.engines import Engine, OTHER_CATEGORY @@ -138,6 +141,22 @@ def highlight_content(content, query): return content +def searxng_format_date(dt: datetime): # pylint: disable=invalid-name + # TODO, check if timezone is calculated right # pylint: disable=fixme + d = dt.date() + t = dt.time() + if d.month == 1 and d.day == 1 and t.hour == 0 and t.minute == 0 and t.second == 0: + return str(d.year) + if dt.replace(tzinfo=None) >= datetime.now() - timedelta(days=1): + timedifference = datetime.now() - dt.replace(tzinfo=None) + minutes = int((timedifference.seconds / 60) % 60) + hours = int(timedifference.seconds / 60 / 60) + if hours == 0: + return gettext('{minutes} minute(s) 
ago').format(minutes=minutes) + return gettext('{hours} hour(s), {minutes} minute(s) ago').format(hours=hours, minutes=minutes) + return format_date(dt) + + def is_flask_run_cmdline(): """Check if the application was started using "flask run" command line From 5ba831d6a88bca617d984593f6710d0c18bae120 Mon Sep 17 00:00:00 2001 From: Alexandre FLAMENT Date: Fri, 26 Aug 2022 16:07:18 +0000 Subject: [PATCH 2/7] Add paper.html result template --- .../static/themes/simple/src/less/style.less | 65 +++++++++++++++++++ .../simple/result_templates/paper.html | 44 +++++++++++++ 2 files changed, 109 insertions(+) create mode 100644 searx/templates/simple/result_templates/paper.html diff --git a/searx/static/themes/simple/src/less/style.less b/searx/static/themes/simple/src/less/style.less index 11d2ef58d..dd8e8a596 100644 --- a/searx/static/themes/simple/src/less/style.less +++ b/searx/static/themes/simple/src/less/style.less @@ -302,6 +302,49 @@ article[data-vim-selected].category-social { } } +.result-paper { + .attributes { + display: table; + border-spacing: 0.125rem; + + div { + display: table-row; + + span { + font-size: 0.9rem; + margin-top: 0.25rem; + display: table-cell; + + time { + font-size: 0.9rem; + } + } + + span:first-child { + color: var(--color-base-font); + min-width: 10rem; + } + + span:nth-child(2) { + color: var(--color-result-publishdate-font); + } + } + } + + .content { + margin-top: 0.25rem; + } + + .comments { + font-size: 0.9rem; + margin: 0.25rem 0 0 0; + padding: 0; + word-wrap: break-word; + line-height: 1.24; + font-style: italic; + } +} + .template_group_images { display: flex; flex-wrap: wrap; @@ -955,6 +998,28 @@ article[data-vim-selected].category-social { border: none !important; background-color: var(--color-sidebar-background); } + + .result-paper { + .attributes { + display: block; + + div { + display: block; + + span { + display: inline; + } + + span:first-child { + font-weight: bold; + } + + span:nth-child(2) { + .ltr-margin-left(0.5rem); + } + } + } + } } /* diff --git a/searx/templates/simple/result_templates/paper.html b/searx/templates/simple/result_templates/paper.html new file mode 100644 index 000000000..3ede1b250 --- /dev/null +++ b/searx/templates/simple/result_templates/paper.html @@ -0,0 +1,44 @@ +{% from 'simple/macros.html' import result_header, result_sub_header, result_sub_footer, result_footer with context %} + +{{ result_header(result, favicons, image_proxify) -}} +
+<div class="attributes">
+  {%- if result.publishedDate %}
+  <div><span>{{ _("Published date") }}:</span><span><time class="published_date" datetime="{{ result.pubdate }}" >{{ result.publishedDate }}</time></span></div>
+  {% endif -%}
+  {%- if result.authors %}<div><span>{{ _("Author") }}:</span><span>{{ result.authors | join(", ") }}</span></div>{% endif -%}
+  {%- if result.journal -%}
+  <div><span>{{- _("Journal") }}:</span><span>{{ result.journal -}}
+    {%- if result.volume -%}
+      &nbsp;{{- result.volume -}}
+      {%- if result.number -%}
+        .{{- result.number -}}
+      {%- endif -%}
+    {%- endif -%}
+    {%- if result.start_page -%}
+      &nbsp;{{- result.start_page -}} / {{- result.end_page -}}
+    {%- endif -%}
+    </span>
+  </div>
+  {%- endif %}
+  {%- if result.editor %}<div><span>{{ _("Editor") }}:</span><span>{{ result.editor }}</span></div>{% endif -%}
+  {%- if result.publisher %}<div><span>{{ _("Publisher") }}:</span><span>{{ result.publisher }}</span></div>{% endif -%}
+  {%- if result.type %}<div><span>{{ _("Type") }}:</span><span>{{ result.type }}</span></div>{% endif -%}
+  {%- if result.tags %}<div><span>{{ _("Tags") }}:</span><span>{{ result.tags | join(", ")}}</span></div>{%- endif -%}
+  {%- if result.doi %}<div><span>{{ _("DOI") }}:</span><span>{{- result.doi -}}</span></div>{% endif -%}
+  {%- if result.issn %}<div><span>{{ _("ISSN") }}:</span><span>{{ result.issn | join(", ") }}</span></div>{% endif -%}
+  {%- if result.isbn %}<div><span>{{ _("ISBN") }}:</span><span>{{ result.isbn | join(", ") }}</span></div>{% endif -%}
+</div>
+{%- if result.content -%}
+<p class="content">
+  {{- result.content | safe -}}
+</p>
+{%- endif -%}
+{%- if result.comments -%}
+<p class="comments">
+  {{- result.comments -}}
+</p>
{%- endif -%} + +{{- result_sub_footer(result, proxify) -}} +{{- result_footer(result) }} From 593026ad9cd024fd7b3182d48f274aa41b374c74 Mon Sep 17 00:00:00 2001 From: Alexandre FLAMENT Date: Fri, 26 Aug 2022 16:07:38 +0000 Subject: [PATCH 3/7] oa_doi_rewrite: add the doi to the result when it is found. Currentty, when oa_doi_rewrite find a DOI in the result URL, it replace the URL. In this commit, the plugin adds the key "doi" to the result, so the paper.html can show it. --- searx/plugins/oa_doi_rewrite.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/searx/plugins/oa_doi_rewrite.py b/searx/plugins/oa_doi_rewrite.py index 54d28bc9a..f0e07735d 100644 --- a/searx/plugins/oa_doi_rewrite.py +++ b/searx/plugins/oa_doi_rewrite.py @@ -42,4 +42,6 @@ def on_result(request, search, result): doi = doi[: -len(suffix)] result['url'] = get_doi_resolver(request.preferences) + doi result['parsed_url'] = urlparse(result['url']) + if 'doi' not in result: + result['doi'] = doi return True From e36f85b8365e5d6a9263dd78242a10a305a9000c Mon Sep 17 00:00:00 2001 From: Alexandre FLAMENT Date: Fri, 26 Aug 2022 16:10:12 +0000 Subject: [PATCH 4/7] Science category: update the engines * use the paper.html template * fetch more data from the engines * add crossref.py --- searx/engines/arxiv.py | 74 ++++++++++++++++++------ searx/engines/crossref.py | 59 +++++++++++++++++++ searx/engines/google_scholar.py | 85 +++++++++++++++++++++++---- searx/engines/pubmed.py | 95 ++++++++++++++++++------------- searx/engines/semantic_scholar.py | 57 +++++++++++++------ searx/engines/springer.py | 38 ++++++------- searx/searxng.msg | 1 + searx/settings.yml | 26 ++------- 8 files changed, 309 insertions(+), 126 deletions(-) create mode 100644 searx/engines/crossref.py diff --git a/searx/engines/arxiv.py b/searx/engines/arxiv.py index a1a58172d..a4811ebd5 100644 --- a/searx/engines/arxiv.py +++ b/searx/engines/arxiv.py @@ -3,9 +3,10 @@ ArXiV (Scientific preprints) """ -from lxml import html +from lxml import etree +from lxml.etree import XPath from datetime import datetime -from searx.utils import eval_xpath_list, eval_xpath_getindex +from searx.utils import eval_xpath, eval_xpath_list, eval_xpath_getindex # about about = { @@ -17,7 +18,7 @@ about = { "results": 'XML-RSS', } -categories = ['science'] +categories = ['science', 'scientific publications'] paging = True base_url = ( @@ -27,6 +28,23 @@ base_url = ( # engine dependent config number_of_results = 10 +# xpaths +arxiv_namespaces = { + "atom": "http://www.w3.org/2005/Atom", + "arxiv": "http://arxiv.org/schemas/atom", +} +xpath_entry = XPath('//atom:entry', namespaces=arxiv_namespaces) +xpath_title = XPath('.//atom:title', namespaces=arxiv_namespaces) +xpath_id = XPath('.//atom:id', namespaces=arxiv_namespaces) +xpath_summary = XPath('.//atom:summary', namespaces=arxiv_namespaces) +xpath_author_name = XPath('.//atom:author/atom:name', namespaces=arxiv_namespaces) +xpath_doi = XPath('.//arxiv:doi', namespaces=arxiv_namespaces) +xpath_pdf = XPath('.//atom:link[@title="pdf"]', namespaces=arxiv_namespaces) +xpath_published = XPath('.//atom:published', namespaces=arxiv_namespaces) +xpath_journal = XPath('.//arxiv:journal_ref', namespaces=arxiv_namespaces) +xpath_category = XPath('.//atom:category/@term', namespaces=arxiv_namespaces) +xpath_comment = XPath('./arxiv:comment', namespaces=arxiv_namespaces) + def request(query, params): # basic search @@ -41,30 +59,50 @@ def request(query, params): def response(resp): results = [] + dom = etree.fromstring(resp.content) + for entry in 
eval_xpath_list(dom, xpath_entry): + title = eval_xpath_getindex(entry, xpath_title, 0).text - dom = html.fromstring(resp.content) + url = eval_xpath_getindex(entry, xpath_id, 0).text + abstract = eval_xpath_getindex(entry, xpath_summary, 0).text - for entry in eval_xpath_list(dom, '//entry'): - title = eval_xpath_getindex(entry, './/title', 0).text + authors = [author.text for author in eval_xpath_list(entry, xpath_author_name)] - url = eval_xpath_getindex(entry, './/id', 0).text + # doi + doi_element = eval_xpath_getindex(entry, xpath_doi, 0, default=None) + doi = None if doi_element is None else doi_element.text - content_string = '{doi_content}{abstract_content}' + # pdf + pdf_element = eval_xpath_getindex(entry, xpath_pdf, 0, default=None) + pdf_url = None if pdf_element is None else pdf_element.attrib.get('href') - abstract = eval_xpath_getindex(entry, './/summary', 0).text + # journal + journal_element = eval_xpath_getindex(entry, xpath_journal, 0, default=None) + journal = None if journal_element is None else journal_element.text - # If a doi is available, add it to the snipppet - doi_element = eval_xpath_getindex(entry, './/link[@title="doi"]', 0, default=None) - doi_content = doi_element.text if doi_element is not None else '' - content = content_string.format(doi_content=doi_content, abstract_content=abstract) + # tags + tag_elements = eval_xpath(entry, xpath_category) + tags = [str(tag) for tag in tag_elements] - if len(content) > 300: - content = content[0:300] + "..." - # TODO: center snippet on query term + # comments + comments_elements = eval_xpath_getindex(entry, xpath_comment, 0, default=None) + comments = None if comments_elements is None else comments_elements.text - publishedDate = datetime.strptime(eval_xpath_getindex(entry, './/published', 0).text, '%Y-%m-%dT%H:%M:%SZ') + publishedDate = datetime.strptime(eval_xpath_getindex(entry, xpath_published, 0).text, '%Y-%m-%dT%H:%M:%SZ') - res_dict = {'url': url, 'title': title, 'publishedDate': publishedDate, 'content': content} + res_dict = { + 'template': 'paper.html', + 'url': url, + 'title': title, + 'publishedDate': publishedDate, + 'content': abstract, + 'doi': doi, + 'authors': authors, + 'journal': journal, + 'tags': tags, + 'comments': comments, + 'pdf_url': pdf_url, + } results.append(res_dict) diff --git a/searx/engines/crossref.py b/searx/engines/crossref.py new file mode 100644 index 000000000..d61318146 --- /dev/null +++ b/searx/engines/crossref.py @@ -0,0 +1,59 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Semantic Scholar (Science) +""" + +from urllib.parse import urlencode +from searx.utils import html_to_text + +about = { + "website": 'https://www.crossref.org/', + "wikidata_id": 'Q5188229', + "official_api_documentation": 'https://github.com/CrossRef/rest-api-doc', + "use_official_api": False, + "require_api_key": False, + "results": 'JSON', +} + +categories = ['science', 'scientific publications'] +paging = True +search_url = 'https://api.crossref.org/works' + + +def request(query, params): + params['url'] = search_url + '?' 
+ urlencode(dict(query=query, offset=20 * (params['pageno'] - 1))) + return params + + +def response(resp): + res = resp.json() + results = [] + for record in res['message']['items']: + record_type = record['type'] + if record_type == 'book-chapter': + title = record['container-title'][0] + if record['title'][0].lower().strip() != title.lower().strip(): + title = title + ' (' + record['title'][0] + ')' + journal = None + else: + title = record['title'][0] + journal = record.get('container-title', [None])[0] + url = record.get('resource', {}).get('primary', {}).get('URL') or record['URL'] + authors = [author.get('given', '') + ' ' + author.get('family', '') for author in record.get('author', [])] + isbn = record.get('isbn') or [i['value'] for i in record.get('isbn-type', [])] + results.append( + { + 'template': 'paper.html', + 'url': url, + 'title': title, + 'journal': journal, + 'volume': record.get('volume'), + 'type': record['type'], + 'content': html_to_text(record.get('abstract', '')), + 'publisher': record.get('publisher'), + 'authors': authors, + 'doi': record['DOI'], + 'isbn': isbn, + } + ) + return results diff --git a/searx/engines/google_scholar.py b/searx/engines/google_scholar.py index 41c62886b..c07cd4cea 100644 --- a/searx/engines/google_scholar.py +++ b/searx/engines/google_scholar.py @@ -13,10 +13,12 @@ Definitions`_. from urllib.parse import urlencode from datetime import datetime +from typing import Optional from lxml import html from searx.utils import ( eval_xpath, + eval_xpath_getindex, eval_xpath_list, extract_text, ) @@ -46,7 +48,7 @@ about = { } # engine dependent config -categories = ['science'] +categories = ['science', 'scientific publications'] paging = True language_support = True use_locale_domain = True @@ -99,7 +101,43 @@ def request(query, params): return params -def response(resp): +def parse_gs_a(text: Optional[str]): + """Parse the text written in green. 
+ + Possible formats: + * "{authors} - {journal}, {year} - {publisher}" + * "{authors} - {year} - {publisher}" + * "{authors} - {publisher}" + """ + if text is None or text == "": + return None, None, None, None + + s_text = text.split(' - ') + authors = s_text[0].split(', ') + publisher = s_text[-1] + if len(s_text) != 3: + return authors, None, publisher, None + + # the format is "{authors} - {journal}, {year} - {publisher}" or "{authors} - {year} - {publisher}" + # get journal and year + journal_year = s_text[1].split(', ') + # journal is optional and may contains some coma + if len(journal_year) > 1: + journal = ', '.join(journal_year[0:-1]) + if journal == '…': + journal = None + else: + journal = None + # year + year = journal_year[-1] + try: + publishedDate = datetime.strptime(year.strip(), '%Y') + except ValueError: + publishedDate = None + return authors, journal, publisher, publishedDate + + +def response(resp): # pylint: disable=too-many-locals """Get response from google's search request""" results = [] @@ -112,30 +150,53 @@ def response(resp): dom = html.fromstring(resp.text) # parse results - for result in eval_xpath_list(dom, '//div[@class="gs_ri"]'): + for result in eval_xpath_list(dom, '//div[@data-cid]'): - title = extract_text(eval_xpath(result, './h3[1]//a')) + title = extract_text(eval_xpath(result, './/h3[1]//a')) if not title: # this is a [ZITATION] block continue - url = eval_xpath(result, './h3[1]//a/@href')[0] - content = extract_text(eval_xpath(result, './div[@class="gs_rs"]')) or '' - - pub_info = extract_text(eval_xpath(result, './div[@class="gs_a"]')) - if pub_info: - content += "[%s]" % pub_info - pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ct1"]')) if pub_type: - title = title + " " + pub_type + pub_type = pub_type[1:-1].lower() + + url = eval_xpath_getindex(result, './/h3[1]//a/@href', 0) + content = extract_text(eval_xpath(result, './/div[@class="gs_rs"]')) + authors, journal, publisher, publishedDate = parse_gs_a( + extract_text(eval_xpath(result, './/div[@class="gs_a"]')) + ) + if publisher in url: + publisher = None + + # cited by + comments = extract_text(eval_xpath(result, './/div[@class="gs_fl"]/a[starts-with(@href,"/scholar?cites=")]')) + + # link to the html or pdf document + html_url = None + pdf_url = None + doc_url = eval_xpath_getindex(result, './/div[@class="gs_or_ggsm"]/a/@href', 0, default=None) + doc_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]')) + if doc_type == "[PDF]": + pdf_url = doc_url + else: + html_url = doc_url results.append( { + 'template': 'paper.html', + 'type': pub_type, 'url': url, 'title': title, + 'authors': authors, + 'publisher': publisher, + 'journal': journal, + 'publishedDate': publishedDate, 'content': content, + 'comments': comments, + 'html_url': html_url, + 'pdf_url': pdf_url, } ) diff --git a/searx/engines/pubmed.py b/searx/engines/pubmed.py index 27444ae24..02e282d5f 100644 --- a/searx/engines/pubmed.py +++ b/searx/engines/pubmed.py @@ -3,11 +3,15 @@ PubMed (Scholar publications) """ -from flask_babel import gettext from lxml import etree from datetime import datetime from urllib.parse import urlencode from searx.network import get +from searx.utils import ( + eval_xpath_getindex, + eval_xpath_list, + extract_text, +) # about about = { @@ -22,7 +26,7 @@ about = { "results": 'XML', } -categories = ['science'] +categories = ['science', 'scientific publications'] base_url = ( 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi' + 
'?db=pubmed&{query}&retstart={offset}&retmax={hits}' @@ -63,46 +67,61 @@ def response(resp): retrieve_url_encoded = pubmed_retrieve_api_url.format(**retrieve_notice_args) - search_results_xml = get(retrieve_url_encoded).content - search_results = etree.XML(search_results_xml).xpath('//PubmedArticleSet/PubmedArticle/MedlineCitation') + search_results_response = get(retrieve_url_encoded).content + search_results = etree.XML(search_results_response) + for entry in eval_xpath_list(search_results, '//PubmedArticle'): + medline = eval_xpath_getindex(entry, './MedlineCitation', 0) - for entry in search_results: - title = entry.xpath('.//Article/ArticleTitle')[0].text - - pmid = entry.xpath('.//PMID')[0].text + title = eval_xpath_getindex(medline, './/Article/ArticleTitle', 0).text + pmid = eval_xpath_getindex(medline, './/PMID', 0).text url = pubmed_url + pmid + content = extract_text( + eval_xpath_getindex(medline, './/Abstract/AbstractText//text()', 0, default=None), allow_none=True + ) + doi = extract_text( + eval_xpath_getindex(medline, './/ELocationID[@EIdType="doi"]/text()', 0, default=None), allow_none=True + ) + journal = extract_text( + eval_xpath_getindex(medline, './Article/Journal/Title/text()', 0, default=None), allow_none=True + ) + issn = extract_text( + eval_xpath_getindex(medline, './Article/Journal/ISSN/text()', 0, default=None), allow_none=True + ) + authors = [] + for author in eval_xpath_list(medline, './Article/AuthorList/Author'): + f = eval_xpath_getindex(author, './ForeName', 0, default=None) + l = eval_xpath_getindex(author, './LastName', 0, default=None) + f = '' if f is None else f.text + l = '' if l is None else l.text + authors.append((f + ' ' + l).strip()) - try: - content = entry.xpath('.//Abstract/AbstractText')[0].text - except: - content = gettext('No abstract is available for this publication.') + res_dict = { + 'template': 'paper.html', + 'url': url, + 'title': title, + 'content': content, + 'journal': journal, + 'issn': [issn], + 'authors': authors, + 'doi': doi, + } - # If a doi is available, add it to the snipppet - try: - doi = entry.xpath('.//ELocationID[@EIdType="doi"]')[0].text - content = 'DOI: {doi} Abstract: {content}'.format(doi=doi, content=content) - except: - pass - - if len(content) > 300: - content = content[0:300] + "..." 
- # TODO: center snippet on query term - - res_dict = {'url': url, 'title': title, 'content': content} - - try: - publishedDate = datetime.strptime( - entry.xpath('.//DateCreated/Year')[0].text - + '-' - + entry.xpath('.//DateCreated/Month')[0].text - + '-' - + entry.xpath('.//DateCreated/Day')[0].text, - '%Y-%m-%d', - ) - res_dict['publishedDate'] = publishedDate - except: - pass + accepted_date = eval_xpath_getindex( + entry, './PubmedData/History//PubMedPubDate[@PubStatus="accepted"]', 0, default=None + ) + if accepted_date is not None: + year = eval_xpath_getindex(accepted_date, './Year', 0) + month = eval_xpath_getindex(accepted_date, './Month', 0) + day = eval_xpath_getindex(accepted_date, './Day', 0) + try: + publishedDate = datetime.strptime( + year.text + '-' + month.text + '-' + day.text, + '%Y-%m-%d', + ) + res_dict['publishedDate'] = publishedDate + except Exception as e: + print(e) results.append(res_dict) - return results + return results diff --git a/searx/engines/semantic_scholar.py b/searx/engines/semantic_scholar.py index bda731047..b2701c333 100644 --- a/searx/engines/semantic_scholar.py +++ b/searx/engines/semantic_scholar.py @@ -6,6 +6,8 @@ from json import dumps, loads from datetime import datetime +from flask_babel import gettext + about = { "website": 'https://www.semanticscholar.org/', "wikidata_id": 'Q22908627', @@ -15,6 +17,7 @@ about = { "results": 'JSON', } +categories = ['science', 'scientific publications'] paging = True search_url = 'https://www.semanticscholar.org/api/1/search' paper_url = 'https://www.semanticscholar.org/paper' @@ -47,9 +50,6 @@ def response(resp): results = [] for result in res['results']: - item = {} - metadata = [] - url = result.get('primaryPaperLink', {}).get('url') if not url and result.get('links'): url = result.get('links')[0] @@ -60,22 +60,47 @@ def response(resp): if not url: url = paper_url + '/%s' % result['id'] - item['url'] = url + # publishedDate + if 'pubDate' in result: + publishedDate = datetime.strptime(result['pubDate'], "%Y-%m-%d") + else: + publishedDate = None - item['title'] = result['title']['text'] - item['content'] = result['paperAbstract']['text'] + # authors + authors = [author[0]['name'] for author in result.get('authors', [])] - metadata = result.get('fieldsOfStudy') or [] - venue = result.get('venue', {}).get('text') - if venue: - metadata.append(venue) - if metadata: - item['metadata'] = ', '.join(metadata) + # pick for the first alternate link, but not from the crawler + pdf_url = None + for doc in result.get('alternatePaperLinks', []): + if doc['linkType'] != 'crawler': + pdf_url = doc['url'] + break - pubDate = result.get('pubDate') - if pubDate: - item['publishedDate'] = datetime.strptime(pubDate, "%Y-%m-%d") + # comments + comments = None + if 'citationStats' in result: + comments = gettext( + '{numCitations} citations from the year {firstCitationVelocityYear} to {lastCitationVelocityYear}' + ).format( + numCitations=result['citationStats']['numCitations'], + firstCitationVelocityYear=result['citationStats']['firstCitationVelocityYear'], + lastCitationVelocityYear=result['citationStats']['lastCitationVelocityYear'], + ) - results.append(item) + results.append( + { + 'template': 'paper.html', + 'url': url, + 'title': result['title']['text'], + 'content': result['paperAbstract']['text'], + 'journal': result.get('venue', {}).get('text') or result.get('journal', {}).get('name'), + 'doi': result.get('doiInfo', {}).get('doi'), + 'tags': result.get('fieldsOfStudy'), + 'authors': authors, + 'pdf_url': pdf_url, 
+ 'publishedDate': publishedDate, + 'comments': comments, + } + ) return results diff --git a/searx/engines/springer.py b/searx/engines/springer.py index 512d71e5e..2711fa807 100644 --- a/searx/engines/springer.py +++ b/searx/engines/springer.py @@ -19,7 +19,7 @@ about = { "results": 'JSON', } -categories = ['science'] +categories = ['science', 'scientific publications'] paging = True nb_per_page = 10 api_key = 'unset' @@ -41,32 +41,30 @@ def response(resp): json_data = loads(resp.text) for record in json_data['records']: - content = record['abstract'][0:500] - if len(record['abstract']) > len(content): - content += "..." + content = record['abstract'] published = datetime.strptime(record['publicationDate'], '%Y-%m-%d') - - metadata = [ - record[x] - for x in [ - 'publicationName', - 'identifier', - 'contentType', - ] - if record.get(x) is not None - ] - - metadata = ' / '.join(metadata) - if record.get('startingPage') and record.get('endingPage') is not None: - metadata += " (%(startingPage)s-%(endingPage)s)" % record - + authors = [" ".join(author['creator'].split(', ')[::-1]) for author in record['creators']] + tags = record.get('genre') + if isinstance(tags, str): + tags = [tags] results.append( { + 'template': 'paper.html', 'title': record['title'], 'url': record['url'][0]['value'].replace('http://', 'https://', 1), + 'type': record.get('contentType'), 'content': content, 'publishedDate': published, - 'metadata': metadata, + 'authors': authors, + 'doi': record.get('doi'), + 'journal': record.get('publicationName'), + 'start_page': record.get('start_page'), + 'end_page': record.get('end_page'), + 'tags': tags, + 'issn': [record.get('issn')], + 'isbn': [record.get('isbn')], + 'volume': record.get('volume') or None, + 'number': record.get('number') or None, } ) return results diff --git a/searx/searxng.msg b/searx/searxng.msg index 3b876f96d..c37240f83 100644 --- a/searx/searxng.msg +++ b/searx/searxng.msg @@ -43,6 +43,7 @@ CATEGORY_GROUPS = { 'REPOS': 'repos', 'SOFTWARE_WIKIS': 'software wikis', 'WEB': 'web', + 'SCIENTIFIC PUBLICATIONS': 'scientific publications', } STYLE_NAMES = { diff --git a/searx/settings.yml b/searx/settings.yml index 3f07bb2dd..ba38e694a 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -319,7 +319,6 @@ engines: - name: arxiv engine: arxiv shortcut: arx - categories: science timeout: 4.0 # tmp suspended: dh key too small @@ -411,23 +410,9 @@ engines: # api_key: 'unset' - name: crossref - engine: json_engine - paging: true - search_url: https://search.crossref.org/dois?q={query}&page={pageno} - url_query: doi - title_query: title - title_html_to_text: true - content_query: fullCitation - content_html_to_text: true - categories: science + engine: crossref shortcut: cr - about: - website: https://www.crossref.org/ - wikidata_id: Q5188229 - official_api_documentation: https://github.com/CrossRef/rest-api-doc - use_official_api: false - require_api_key: false - results: JSON + timeout: 10 - name: yep engine: json_engine @@ -1068,7 +1053,7 @@ engines: title_query: metadata/oaf:entity/oaf:result/title/$ content_query: metadata/oaf:entity/oaf:result/description/$ content_html_to_text: true - categories: science + categories: "science" shortcut: oad timeout: 5.0 about: @@ -1198,7 +1183,6 @@ engines: - name: pubmed engine: pubmed shortcut: pub - categories: science timeout: 3.0 - name: pypi @@ -1346,7 +1330,6 @@ engines: engine: semantic_scholar disabled: true shortcut: se - categories: science # Spotify needs API credentials # - name: spotify @@ -1372,8 +1355,7 
@@ engines: # # working API key, for test & debug: "a69685087d07eca9f13db62f65b8f601" # api_key: 'unset' # shortcut: springer - # categories: science - # timeout: 6.0 + # timeout: 15.0 - name: startpage engine: startpage From fe43b6e8211c972700bc2602e8f16018c72bb269 Mon Sep 17 00:00:00 2001 From: Alexandre FLAMENT Date: Fri, 26 Aug 2022 17:25:45 +0000 Subject: [PATCH 5/7] [build] /static --- .../themes/simple/css/searxng-rtl.min.css | Bin 68358 -> 69227 bytes .../themes/simple/css/searxng-rtl.min.css.map | Bin 110492 -> 111870 bytes .../static/themes/simple/css/searxng.min.css | Bin 67261 -> 68129 bytes .../themes/simple/css/searxng.min.css.map | Bin 108805 -> 110183 bytes 4 files changed, 0 insertions(+), 0 deletions(-) diff --git a/searx/static/themes/simple/css/searxng-rtl.min.css b/searx/static/themes/simple/css/searxng-rtl.min.css index 450f5d96d9d178aa8c54d05c0db09b23c9b0e51b..1462d0d5e32f8deb93d33388701732479bd60972 100644 GIT binary patch delta 806 zcma))Jxc>I7{?JxT?-Y&LAXU|hs0~8h`GhDlDnk6U@jMu)I&?T&B@6FCqIPX;Og41 z;MB3=;vmlEdac?!T1(dF`91%a+}zjRA8L=YX{x9maswomR5=hEqnJN5RF4SLlH)66 zu+ORI$BIw|w8Q}u0|YzWo}$qhu3L8qyIe&86(`mCba^jda4cSo`mr!TvuldrnZ4mf zSPhr}CYA`!uX-!eKtMUKf-79du!xr7W4O`5ToCFrr40x|#z}@>4$r;F%1Ib8A?mpQ zrcf}12@682+qRJA-R*An)@r2fvhu*Xr^1XXuB(GlWCwNG(ju=#R*suLPhvvy-dr!S zMS?_OiF`$I_W8P>eVpt)&8E$xbZr(tP5L|zF3L)NKg~+(Tg(=PFY8R1;7riX_(~;( i|AewZXwa$aZnOTkjb+k`)pG*uWyh0)=SF?!tM&~xSqz-jO#R#O*;B{&zjqNx{WY)D)cV*lNiiwFLX6?1T&hFaT zUB_udi3+D84xkZCL_r9UIB=+d93n0V{s6=Q4jfPrZ~_Tc91vV6%*=XYw}>0|!RznG z``&xses6t0bo=L_k8YHAl$;ckQ6?)bBZQ;^${nO&&=R_CLpfQBmqJxQl-K(u!0+hvJ#Cm)po7MHLlUsRz51le&UHe_#wjzz_w0#0drzpM=%2{xK?&ZC;IbeiP$b611$b@`&tKCw=&V;Zzg zze=feO){vn3aL9qJj2#d~V>7P+O;aFUUg6fUrcTE0;VKHDPiRJMb0 zTfX1{6>-ESr>ma^+iXLg{AKKP$gOHPm~>gFXKB4dCbi!S)+TEdo)+-NCSxG>qwv>2 ztU^)eyFN!|!&aZ;9FO-nu5U5mgiOBEv*hjC_IPrONfoFHj%>+!X_>?y3hg~!{|rcW z!sn$Rwlg6~!48kkoKp?{D>ka&nFN{A>*xP3isW1Ui~nNwd2<#>$s}Vf_!O|7l2^nr zJuT(A=>^qD^|MZjJZL;Q7Ej4-**7$mBG-)7)p`cy4f)PHWS2B>eTDw@iAyV@rEKrA zgz!8P=cLX?IJ$7Sh&yNGScd#$oINs|d?{@1hfVN)Xvfx^j^RezxUJc6d)0z)8oREo zs~uN!ya0q%-Dw;6avHi=HM9j@1)psH{B0Cz(~ z?N(RR(WgRSlDu#*Nk5_B*rE@>@%iV@!O@Cl7>DpY@9L)3p8+9x{@^(MV-bwguSUV} z7~^K8Yr5^Y(Mm%%=;##y$=`pDE%06lP2Gxi^{U%Q!+6YN?j^4J5+D>1mfH{j)zdRm zG;jr+B8%_cN3T5wj?q8>j0{g#T9&I>ENCy)C9PfOn|3d&%Dws*l%b3kuTao@CRzvGZxSJUeaH_cuFqssVu gra|~`m_g%w9Qtt$JaP}KrmuYf9@u~VCb$>;13dlrxBvhE delta 773 zcmb_aF>ljA6qXq>AYxR}3MzF(T{slSjXNUU1&6wpDN!3zhgQ|uO`SOBIEfRt0x=;K zm601Vl#LxBd5D-0Li`3K7FM!WVgzw_wpGN;!*gRbu1y*X~yfXxvL8W3O}>G-&r`1@y4@x5=8E{+-(s4xSH+Q;BLd2HDXK%r<3NxL!8_-vW&*V~wHdCT491iG>_6P0h zh&6~s*0IWC;Y_$SAR`cwen@yKyTEWnrK6u4ClMnPofcLirmDfmn!G9B&Lr2()-$=( z?Q=ZSJ(yfM`sq&|=|8LpD&@(|HUGZr*a@zp|=EuLj%KOBx!We0RY*CBa^y oTZQ!q(cAsXmGat)R)5y&OK+E^w_Zw1vmd9@$2+r&M7k#Z0r7(Fh5!Hn diff --git a/searx/static/themes/simple/css/searxng.min.css b/searx/static/themes/simple/css/searxng.min.css index 09d26b534d5bf5b07d88ada94eb61c147bcb9184..90820978fda154ec09abd7db111fd7c3f4432231 100644 GIT binary patch delta 796 zcma)4&q~8E97Y@pUK}zJyx5Dd9a6T5f+>QJ;47p}wg%HQB$-9a=o5H~;Mwyof@eV= zLIm|NkDjJ)V7q_1mQBg^%lG^K-Cve&u1dFW{X`PgVHyY|h_q~^wWMxG6BT2s1ViUg zBbO1!4J9TLD1ki6+t6+__azCs_Pp;Frf0JhK!#BfpG@v$AFPy~$E}cSpy&mG_F;O% z36N}44sJ-q77NCO^WtVmqowKNS#p1cbdIv5IqEh?1;o==%+JXMT%SUYxtR8d5+ z_F<#`XAmPAuKghit~P7tmo1+QlQFP8s|06%BpA25eVa~ zx4ILhMS@5o0bNOuIU1~*!AseE?p4hC`{vzOe|a~Vo5{3MmxbPGUeV8ZX4?M;c2xYN s)&t5Jlx*Xv*#VmoOLopsj1SjI#J+_+ZMS$lET*~+{&u@=B-`>h!NhigS1QW722O$(QFabrCo#vnn#drrf&%TWW%RIl6nPaCc ztKro-mebyylY(M^ekUn0%!(~ach)VRofpe9P)y`)te${rg=fg3Lpz%k+gyO#5Pv%3 
zyB_F%&+R9(&K%6K(>zm?W+$5fZ|1+~ujHUuYvQXtR)>5l|RPQh2U@{ zpA->~Kq{Ei$jHcVeO_1xUQ;<`r6Ledn#wJENYm9|AR6_<3o>8*9>#V`;gC-&X(dn#;5>!?Mz-zKri^zQAPL z6-CEh>4=(OaYxkB3T28)_^~GA+ZR{gimlMuI86oxR-}|Thr>q(w_y1P&}Bt2(0oIU zC@qQ1C^~!dJ4>TT6bH2OD?}XqetNszOHqSXwY>ugLZgOE#wG~lt`{66@=Y)_?bfqu_Uvh* z28*h$@1hWdre>%%iA(dY2aI+zZUU#l5V<-A21)fE=)tuY-6=mUwuuG7(9B?4D>WCV zgmB1KZ9|t``L@O>v$+^7E^2z|Sj4~6>&h4yCO^Iodhv^|4gf#-brTE_@f>)qZ@gGF zo2uC)56*z0(at~)Erkt(!VO`Syj%l^1`K4CHFHV`jq_X~B!mcRQ|W&;!O1Q<*3+wC zc>CZb@FYM^-2(f1oqHhPY=BYn{W|Ew#-qMB+`*{lH_m}U7W28WJ!b#C0gk-6YepNi x-MZ5PqsN#F|H5D40zo%{?*;-ry`gb-?S69DLruAK0qi5$+n{IrZF1MJd;cja@moSr^m}G=OsHrO!rPK1Tw*naZSnsGQl?aCgvt@z8=rU#8;H= z6&9Wv-C(l+Z$#L?7dLXep$H^C} zBdGu+4)soP&g47E({ff~WadM6qb16>db4jxCC4czKFz0xK z;tFb|5lGa?C8-QVCxWd`OgEVTc)#PV6juPTHbcGJVpop?KTaJSG2cB JuVv(61OStm-@O0; From 08b88597052dfdf17e947289d79510fdadad51e3 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sun, 18 Sep 2022 14:52:54 +0200 Subject: [PATCH 6/7] [doc] paper.html result template Signed-off-by: Markus Heiser --- docs/dev/engine_overview.rst | 89 ++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/docs/dev/engine_overview.rst b/docs/dev/engine_overview.rst index e950ae667..731e2f86a 100644 --- a/docs/dev/engine_overview.rst +++ b/docs/dev/engine_overview.rst @@ -311,3 +311,92 @@ the parameter ``template`` must be set to the desired type. address.postcode postcode of object address.country country of object ========================= ===================================================== + +.. _BibTeX format: https://www.bibtex.com/g/bibtex-format/ +.. _BibTeX field types: https://en.wikipedia.org/wiki/BibTeX#Field_types + +.. list-table:: Parameter of the **paper** media type / + see `BibTeX field types`_ and `BibTeX format`_ + :header-rows: 2 + :width: 100% + + * - result-parameter + - Python type + - information + + * - template + - :py:class:`str` + - is set to ``paper.html`` + + * - title + - :py:class:`str` + - title of the result + + * - content + - :py:class:`str` + - abstract + + * - comments + - :py:class:`str` + - free text display in italic below the content + + * - tags + - :py:class:`List `\ [\ :py:class:`str`\ ] + - free tag list + + * - publishedDate + - :py:class:`datetime ` + - last publication date + + * - authors + - :py:class:`List `\ [\ :py:class:`str`\ ] + - list of authors of the work (authors with a "s") + + * - editor + - :py:class:`str` + - list of editors of a book + + * - publisher + - :py:class:`str` + - name of the publisher + + * - journal + - :py:class:`str` + - name of the journal or magazine the article was + published in + + * - volume + - :py:class:`str` + - volume number + + * - start_page + - :py:class:`int` + - page number where the article starts + + * - end_page + - :py:class:`int` + - page number where the article ends + + * - number + - :py:class:`str` + - number of the report or the issue number for a journal article + + * - doi + - :py:class:`str` + - DOI number (like ``10.1038/d41586-018-07848-2``) + + * - issn + - :py:class:`str` + - ISSN number like ``1476-4687`` + + * - isbn + - :py:class:`str` + - ISBN number like ``9780201896831`` + + * - pdf_url + - :py:class:`str` + - URL to the full article, the PDF version + + * - html_url + - :py:class:`str` + - URL to full article, HTML version From d6446be38f3f858c09887a89c8fc490a3c300b95 Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Fri, 23 Sep 2022 19:58:14 +0200 Subject: [PATCH 7/7] [mod] science category: various update of about PR 1705 --- 
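Note for reviewers (placed below the "---" marker, so `git am` drops it): a
quick sketch of the renamed helper's observable behavior. Illustrative only —
the exact strings come from the flask_babel translations of the active locale,
and the calls assume a running Flask/Babel application context.

    from datetime import datetime, timedelta
    from searx.webutils import searxng_l10n_timespan

    searxng_l10n_timespan(datetime(2022, 1, 1))                   # -> '2022' (January 1st, midnight)
    searxng_l10n_timespan(datetime.now() - timedelta(minutes=5))  # -> '5 minute(s) ago'
    searxng_l10n_timespan(datetime.now() - timedelta(hours=3))    # -> '3 hour(s), 0 minute(s) ago'
    searxng_l10n_timespan(datetime(2021, 6, 15, 12, 30))          # -> locale date via format_date, e.g. 'Jun 15, 2021'
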
docs/dev/engine_overview.rst | 10 +++------- searx/engines/crossref.py | 4 ++-- searx/engines/semantic_scholar.py | 3 +-- searx/engines/springer.py | 3 +-- searx/settings.yml | 3 ++- searx/templates/simple/result_templates/paper.html | 4 ++-- searx/webapp.py | 4 ++-- searx/webutils.py | 8 +++++++- 8 files changed, 20 insertions(+), 19 deletions(-) diff --git a/docs/dev/engine_overview.rst b/docs/dev/engine_overview.rst index 731e2f86a..7d94b83f1 100644 --- a/docs/dev/engine_overview.rst +++ b/docs/dev/engine_overview.rst @@ -369,13 +369,9 @@ the parameter ``template`` must be set to the desired type. - :py:class:`str` - volume number - * - start_page - - :py:class:`int` - - page number where the article starts - - * - end_page - - :py:class:`int` - - page number where the article ends + * - pages + - :py:class:`str` + - page range where the article is * - number - :py:class:`str` diff --git a/searx/engines/crossref.py b/searx/engines/crossref.py index d61318146..fbe2f0c2a 100644 --- a/searx/engines/crossref.py +++ b/searx/engines/crossref.py @@ -33,10 +33,10 @@ def response(resp): if record_type == 'book-chapter': title = record['container-title'][0] if record['title'][0].lower().strip() != title.lower().strip(): - title = title + ' (' + record['title'][0] + ')' + title = html_to_text(title) + ' (' + html_to_text(record['title'][0]) + ')' journal = None else: - title = record['title'][0] + title = html_to_text(record['title'][0]) journal = record.get('container-title', [None])[0] url = record.get('resource', {}).get('primary', {}).get('URL') or record['URL'] authors = [author.get('given', '') + ' ' + author.get('family', '') for author in record.get('author', [])] diff --git a/searx/engines/semantic_scholar.py b/searx/engines/semantic_scholar.py index b2701c333..7a1b5b231 100644 --- a/searx/engines/semantic_scholar.py +++ b/searx/engines/semantic_scholar.py @@ -48,7 +48,6 @@ def request(query, params): def response(resp): res = loads(resp.text) results = [] - for result in res['results']: url = result.get('primaryPaperLink', {}).get('url') if not url and result.get('links'): @@ -72,7 +71,7 @@ def response(resp): # pick for the first alternate link, but not from the crawler pdf_url = None for doc in result.get('alternatePaperLinks', []): - if doc['linkType'] != 'crawler': + if doc['linkType'] not in ('crawler', 'doi'): pdf_url = doc['url'] break diff --git a/searx/engines/springer.py b/searx/engines/springer.py index 2711fa807..e5255b794 100644 --- a/searx/engines/springer.py +++ b/searx/engines/springer.py @@ -58,8 +58,7 @@ def response(resp): 'authors': authors, 'doi': record.get('doi'), 'journal': record.get('publicationName'), - 'start_page': record.get('start_page'), - 'end_page': record.get('end_page'), + 'pages': record.get('start_page') + '-' + record.get('end_page'), 'tags': tags, 'issn': [record.get('issn')], 'isbn': [record.get('isbn')], diff --git a/searx/settings.yml b/searx/settings.yml index ba38e694a..9e9b2f9e6 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -412,7 +412,8 @@ engines: - name: crossref engine: crossref shortcut: cr - timeout: 10 + timeout: 30 + disable: true - name: yep engine: json_engine diff --git a/searx/templates/simple/result_templates/paper.html b/searx/templates/simple/result_templates/paper.html index 3ede1b250..54704c866 100644 --- a/searx/templates/simple/result_templates/paper.html +++ b/searx/templates/simple/result_templates/paper.html @@ -13,8 +13,8 @@ .{{- result.number -}} {%- endif -%} {%- endif -%} - {%- if result.start_page -%} 
-      &nbsp;{{- result.start_page -}} / {{- result.end_page -}}
+    {%- if result.pages -%}
+      &nbsp;{{- result.pages -}}
     {%- endif -%}
     </span>
   </div>
diff --git a/searx/webapp.py b/searx/webapp.py
index e6bda42be..44500911a 100755
--- a/searx/webapp.py
+++ b/searx/webapp.py
@@ -77,7 +77,7 @@ from searx.webutils import (
     is_hmac_of,
     is_flask_run_cmdline,
     group_engines_in_tab,
-    searxng_format_date,
+    searxng_l10n_timespan,
 )
 from searx.webadapter import (
     get_search_query_from_webapp,
@@ -723,7 +723,7 @@ def search():
             except ValueError:
                 result['publishedDate'] = None
             else:
-                result['publishedDate'] = searxng_format_date(result['publishedDate'])
+                result['publishedDate'] = searxng_l10n_timespan(result['publishedDate'])
 
         # set result['open_group'] = True when the template changes from the previous result
         # set result['close_group'] = True when the template changes on the next result
diff --git a/searx/webutils.py b/searx/webutils.py
index f084fe9d3..a5ed27c2c 100644
--- a/searx/webutils.py
+++ b/searx/webutils.py
@@ -141,7 +141,13 @@ def highlight_content(content, query):
     return content
 
 
-def searxng_format_date(dt: datetime):  # pylint: disable=invalid-name
+def searxng_l10n_timespan(dt: datetime) -> str:  # pylint: disable=invalid-name
+    """Return a human-readable and translated string that describes the time
+    span between a date in the past and the present.
+
+    If the date is January 1st at midnight, the returned string contains only
+    the year.
+    """
     # TODO, check if timezone is calculated right  # pylint: disable=fixme
     d = dt.date()
     t = dt.time()
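
Reviewer note on PATCH 4: the new google_scholar.parse_gs_a() helper splits the
green byline into (authors, journal, publisher, publishedDate). A hand-traced
sketch of the three byline shapes it handles — the inputs are made up:

    from datetime import datetime
    from searx.engines.google_scholar import parse_gs_a

    parse_gs_a('J Doe, R Roe - Nature, 2019 - nature.com')
    # -> (['J Doe', 'R Roe'], 'Nature', 'nature.com', datetime(2019, 1, 1))

    parse_gs_a('J Doe - 2019 - nature.com')
    # -> (['J Doe'], None, 'nature.com', datetime(2019, 1, 1))

    parse_gs_a('J Doe - nature.com')
    # -> (['J Doe'], None, 'nature.com', None)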
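
Reviewer note on the series as a whole: after PATCH 6 and 7, an engine opts into
the new template by returning results shaped like the sketch below. The field
names come from the table added to docs/dev/engine_overview.rst; every value
here is invented for illustration, and real engines fill only the fields they
can source.

    from datetime import datetime

    def response(resp):
        # minimal, hypothetical paper.html result
        return [{
            'template': 'paper.html',
            'url': 'https://example.org/paper/42',
            'title': 'An Example Paper',
            'content': 'Abstract of the paper ...',
            'authors': ['A. Author', 'B. Author'],
            'journal': 'Journal of Examples',
            'volume': '7',
            'number': '2',
            'pages': '101-110',                    # PATCH 7 merges start_page/end_page into 'pages'
            'publishedDate': datetime(2021, 6, 15),
            'doi': '10.1038/d41586-018-07848-2',   # DOI format example taken from the docs table
            'pdf_url': 'https://example.org/paper/42.pdf',
        }]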