Science category: update the engines
* use the paper.html template
* fetch more data from the engines
* add crossref.py
This commit is contained in:
  parent 593026ad9c
  commit e36f85b836
				| @ -3,9 +3,10 @@ | |||||||
|  ArXiV (Scientific preprints) |  ArXiV (Scientific preprints) | ||||||
| """ | """ | ||||||
| 
 | 
 | ||||||
| from lxml import html | from lxml import etree | ||||||
|  | from lxml.etree import XPath | ||||||
| from datetime import datetime | from datetime import datetime | ||||||
| from searx.utils import eval_xpath_list, eval_xpath_getindex | from searx.utils import eval_xpath, eval_xpath_list, eval_xpath_getindex | ||||||
| 
 | 
 | ||||||
| # about | # about | ||||||
| about = { | about = { | ||||||
| @ -17,7 +18,7 @@ about = { | |||||||
|     "results": 'XML-RSS', |     "results": 'XML-RSS', | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| categories = ['science'] | categories = ['science', 'scientific publications'] | ||||||
| paging = True | paging = True | ||||||
| 
 | 
 | ||||||
| base_url = ( | base_url = ( | ||||||
| @ -27,6 +28,23 @@ base_url = ( | |||||||
| # engine dependent config | # engine dependent config | ||||||
| number_of_results = 10 | number_of_results = 10 | ||||||
| 
 | 
 | ||||||
|  | # xpaths | ||||||
|  | arxiv_namespaces = { | ||||||
|  |     "atom": "http://www.w3.org/2005/Atom", | ||||||
|  |     "arxiv": "http://arxiv.org/schemas/atom", | ||||||
|  | } | ||||||
|  | xpath_entry = XPath('//atom:entry', namespaces=arxiv_namespaces) | ||||||
|  | xpath_title = XPath('.//atom:title', namespaces=arxiv_namespaces) | ||||||
|  | xpath_id = XPath('.//atom:id', namespaces=arxiv_namespaces) | ||||||
|  | xpath_summary = XPath('.//atom:summary', namespaces=arxiv_namespaces) | ||||||
|  | xpath_author_name = XPath('.//atom:author/atom:name', namespaces=arxiv_namespaces) | ||||||
|  | xpath_doi = XPath('.//arxiv:doi', namespaces=arxiv_namespaces) | ||||||
|  | xpath_pdf = XPath('.//atom:link[@title="pdf"]', namespaces=arxiv_namespaces) | ||||||
|  | xpath_published = XPath('.//atom:published', namespaces=arxiv_namespaces) | ||||||
|  | xpath_journal = XPath('.//arxiv:journal_ref', namespaces=arxiv_namespaces) | ||||||
|  | xpath_category = XPath('.//atom:category/@term', namespaces=arxiv_namespaces) | ||||||
|  | xpath_comment = XPath('./arxiv:comment', namespaces=arxiv_namespaces) | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| def request(query, params): | def request(query, params): | ||||||
|     # basic search |     # basic search | ||||||
| @ -41,30 +59,50 @@ def request(query, params): | |||||||
| 
 | 
 | ||||||
| def response(resp): | def response(resp): | ||||||
|     results = [] |     results = [] | ||||||
|  |     dom = etree.fromstring(resp.content) | ||||||
|  |     for entry in eval_xpath_list(dom, xpath_entry): | ||||||
|  |         title = eval_xpath_getindex(entry, xpath_title, 0).text | ||||||
| 
 | 
 | ||||||
|     dom = html.fromstring(resp.content) |         url = eval_xpath_getindex(entry, xpath_id, 0).text | ||||||
|  |         abstract = eval_xpath_getindex(entry, xpath_summary, 0).text | ||||||
| 
 | 
 | ||||||
|     for entry in eval_xpath_list(dom, '//entry'): |         authors = [author.text for author in eval_xpath_list(entry, xpath_author_name)] | ||||||
|         title = eval_xpath_getindex(entry, './/title', 0).text |  | ||||||
| 
 | 
 | ||||||
|         url = eval_xpath_getindex(entry, './/id', 0).text |         #  doi | ||||||
|  |         doi_element = eval_xpath_getindex(entry, xpath_doi, 0, default=None) | ||||||
|  |         doi = None if doi_element is None else doi_element.text | ||||||
| 
 | 
 | ||||||
|         content_string = '{doi_content}{abstract_content}' |         # pdf | ||||||
|  |         pdf_element = eval_xpath_getindex(entry, xpath_pdf, 0, default=None) | ||||||
|  |         pdf_url = None if pdf_element is None else pdf_element.attrib.get('href') | ||||||
| 
 | 
 | ||||||
|         abstract = eval_xpath_getindex(entry, './/summary', 0).text |         # journal | ||||||
|  |         journal_element = eval_xpath_getindex(entry, xpath_journal, 0, default=None) | ||||||
|  |         journal = None if journal_element is None else journal_element.text | ||||||
| 
 | 
 | ||||||
|         #  If a doi is available, add it to the snipppet |         # tags | ||||||
|         doi_element = eval_xpath_getindex(entry, './/link[@title="doi"]', 0, default=None) |         tag_elements = eval_xpath(entry, xpath_category) | ||||||
|         doi_content = doi_element.text if doi_element is not None else '' |         tags = [str(tag) for tag in tag_elements] | ||||||
|         content = content_string.format(doi_content=doi_content, abstract_content=abstract) |  | ||||||
| 
 | 
 | ||||||
|         if len(content) > 300: |         # comments | ||||||
|             content = content[0:300] + "..." |         comments_elements = eval_xpath_getindex(entry, xpath_comment, 0, default=None) | ||||||
|         # TODO: center snippet on query term |         comments = None if comments_elements is None else comments_elements.text | ||||||
| 
 | 
 | ||||||
|         publishedDate = datetime.strptime(eval_xpath_getindex(entry, './/published', 0).text, '%Y-%m-%dT%H:%M:%SZ') |         publishedDate = datetime.strptime(eval_xpath_getindex(entry, xpath_published, 0).text, '%Y-%m-%dT%H:%M:%SZ') | ||||||
| 
 | 
 | ||||||
|         res_dict = {'url': url, 'title': title, 'publishedDate': publishedDate, 'content': content} |         res_dict = { | ||||||
|  |             'template': 'paper.html', | ||||||
|  |             'url': url, | ||||||
|  |             'title': title, | ||||||
|  |             'publishedDate': publishedDate, | ||||||
|  |             'content': abstract, | ||||||
|  |             'doi': doi, | ||||||
|  |             'authors': authors, | ||||||
|  |             'journal': journal, | ||||||
|  |             'tags': tags, | ||||||
|  |             'comments': comments, | ||||||
|  |             'pdf_url': pdf_url, | ||||||
|  |         } | ||||||
| 
 | 
 | ||||||
|         results.append(res_dict) |         results.append(res_dict) | ||||||
| 
 | 
 | ||||||
|  | |||||||
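For orientation (not part of the commit): a minimal sketch of the result dict that the updated engines now hand to the paper.html template, using the field names introduced in the hunk above. All values below are placeholders, and the exact set of keys varies per engine.

    from datetime import datetime

    results = []
    # purely illustrative values, not real data
    results.append({
        'template': 'paper.html',                      # selects the paper result template
        'url': 'https://arxiv.org/abs/0000.00000',     # placeholder link to the publication
        'title': 'Example paper title',
        'content': 'Abstract text used as the snippet ...',
        'doi': '10.0000/example.doi',
        'authors': ['First Author', 'Second Author'],
        'journal': 'Example Journal',
        'tags': ['cs.IR'],
        'comments': '10 pages, 3 figures',
        'pdf_url': 'https://arxiv.org/pdf/0000.00000',
        'publishedDate': datetime(2021, 1, 1),
    })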
							
								
								
									
searx/engines/crossref.py (new file, 59 lines)
							| @ -0,0 +1,59 @@ | |||||||
|  | # SPDX-License-Identifier: AGPL-3.0-or-later | ||||||
|  | # lint: pylint | ||||||
|  | """Semantic Scholar (Science) | ||||||
|  | """ | ||||||
|  | 
 | ||||||
|  | from urllib.parse import urlencode | ||||||
|  | from searx.utils import html_to_text | ||||||
|  | 
 | ||||||
|  | about = { | ||||||
|  |     "website": 'https://www.crossref.org/', | ||||||
|  |     "wikidata_id": 'Q5188229', | ||||||
|  |     "official_api_documentation": 'https://github.com/CrossRef/rest-api-doc', | ||||||
|  |     "use_official_api": False, | ||||||
|  |     "require_api_key": False, | ||||||
|  |     "results": 'JSON', | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | categories = ['science', 'scientific publications'] | ||||||
|  | paging = True | ||||||
|  | search_url = 'https://api.crossref.org/works' | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def request(query, params): | ||||||
|  |     params['url'] = search_url + '?' + urlencode(dict(query=query, offset=20 * (params['pageno'] - 1))) | ||||||
|  |     return params | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def response(resp): | ||||||
|  |     res = resp.json() | ||||||
|  |     results = [] | ||||||
|  |     for record in res['message']['items']: | ||||||
|  |         record_type = record['type'] | ||||||
|  |         if record_type == 'book-chapter': | ||||||
|  |             title = record['container-title'][0] | ||||||
|  |             if record['title'][0].lower().strip() != title.lower().strip(): | ||||||
|  |                 title = title + ' (' + record['title'][0] + ')' | ||||||
|  |             journal = None | ||||||
|  |         else: | ||||||
|  |             title = record['title'][0] | ||||||
|  |             journal = record.get('container-title', [None])[0] | ||||||
|  |         url = record.get('resource', {}).get('primary', {}).get('URL') or record['URL'] | ||||||
|  |         authors = [author.get('given', '') + ' ' + author.get('family', '') for author in record.get('author', [])] | ||||||
|  |         isbn = record.get('isbn') or [i['value'] for i in record.get('isbn-type', [])] | ||||||
|  |         results.append( | ||||||
|  |             { | ||||||
|  |                 'template': 'paper.html', | ||||||
|  |                 'url': url, | ||||||
|  |                 'title': title, | ||||||
|  |                 'journal': journal, | ||||||
|  |                 'volume': record.get('volume'), | ||||||
|  |                 'type': record['type'], | ||||||
|  |                 'content': html_to_text(record.get('abstract', '')), | ||||||
|  |                 'publisher': record.get('publisher'), | ||||||
|  |                 'authors': authors, | ||||||
|  |                 'doi': record['DOI'], | ||||||
|  |                 'isbn': isbn, | ||||||
|  |             } | ||||||
|  |         ) | ||||||
|  |     return results | ||||||
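As a rough illustration (not part of the commit), the request() function above builds URLs of the following shape; the query string here is a made-up example:

    from urllib.parse import urlencode

    search_url = 'https://api.crossref.org/works'
    pageno = 2  # example page number
    url = search_url + '?' + urlencode(dict(query='gravitational waves', offset=20 * (pageno - 1)))
    # url == 'https://api.crossref.org/works?query=gravitational+waves&offset=20'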
| @ -13,10 +13,12 @@ Definitions`_. | |||||||
| 
 | 
 | ||||||
| from urllib.parse import urlencode | from urllib.parse import urlencode | ||||||
| from datetime import datetime | from datetime import datetime | ||||||
|  | from typing import Optional | ||||||
| from lxml import html | from lxml import html | ||||||
| 
 | 
 | ||||||
| from searx.utils import ( | from searx.utils import ( | ||||||
|     eval_xpath, |     eval_xpath, | ||||||
|  |     eval_xpath_getindex, | ||||||
|     eval_xpath_list, |     eval_xpath_list, | ||||||
|     extract_text, |     extract_text, | ||||||
| ) | ) | ||||||
| @ -46,7 +48,7 @@ about = { | |||||||
| } | } | ||||||
| 
 | 
 | ||||||
| # engine dependent config | # engine dependent config | ||||||
| categories = ['science'] | categories = ['science', 'scientific publications'] | ||||||
| paging = True | paging = True | ||||||
| language_support = True | language_support = True | ||||||
| use_locale_domain = True | use_locale_domain = True | ||||||
| @ -99,7 +101,43 @@ def request(query, params): | |||||||
|     return params |     return params | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def response(resp): | def parse_gs_a(text: Optional[str]): | ||||||
|  |     """Parse the text written in green. | ||||||
|  | 
 | ||||||
|  |     Possible formats: | ||||||
|  |     * "{authors} - {journal}, {year} - {publisher}" | ||||||
|  |     * "{authors} - {year} - {publisher}" | ||||||
|  |     * "{authors} - {publisher}" | ||||||
|  |     """ | ||||||
|  |     if text is None or text == "": | ||||||
|  |         return None, None, None, None | ||||||
|  | 
 | ||||||
|  |     s_text = text.split(' - ') | ||||||
|  |     authors = s_text[0].split(', ') | ||||||
|  |     publisher = s_text[-1] | ||||||
|  |     if len(s_text) != 3: | ||||||
|  |         return authors, None, publisher, None | ||||||
|  | 
 | ||||||
|  |     # the format is "{authors} - {journal}, {year} - {publisher}" or "{authors} - {year} - {publisher}" | ||||||
|  |     # get journal and year | ||||||
|  |     journal_year = s_text[1].split(', ') | ||||||
|  |     # journal is optional and may contain commas | ||||||
|  |     if len(journal_year) > 1: | ||||||
|  |         journal = ', '.join(journal_year[0:-1]) | ||||||
|  |         if journal == '…': | ||||||
|  |             journal = None | ||||||
|  |     else: | ||||||
|  |         journal = None | ||||||
|  |     # year | ||||||
|  |     year = journal_year[-1] | ||||||
|  |     try: | ||||||
|  |         publishedDate = datetime.strptime(year.strip(), '%Y') | ||||||
|  |     except ValueError: | ||||||
|  |         publishedDate = None | ||||||
|  |     return authors, journal, publisher, publishedDate | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def response(resp):  # pylint: disable=too-many-locals | ||||||
|     """Get response from google's search request""" |     """Get response from google's search request""" | ||||||
|     results = [] |     results = [] | ||||||
| 
 | 
 | ||||||
| @ -112,30 +150,53 @@ def response(resp): | |||||||
|     dom = html.fromstring(resp.text) |     dom = html.fromstring(resp.text) | ||||||
| 
 | 
 | ||||||
|     # parse results |     # parse results | ||||||
|     for result in eval_xpath_list(dom, '//div[@class="gs_ri"]'): |     for result in eval_xpath_list(dom, '//div[@data-cid]'): | ||||||
| 
 | 
 | ||||||
|         title = extract_text(eval_xpath(result, './h3[1]//a')) |         title = extract_text(eval_xpath(result, './/h3[1]//a')) | ||||||
| 
 | 
 | ||||||
|         if not title: |         if not title: | ||||||
|             # this is a [ZITATION] block |             # this is a [ZITATION] block | ||||||
|             continue |             continue | ||||||
| 
 | 
 | ||||||
|         url = eval_xpath(result, './h3[1]//a/@href')[0] |  | ||||||
|         content = extract_text(eval_xpath(result, './div[@class="gs_rs"]')) or '' |  | ||||||
| 
 |  | ||||||
|         pub_info = extract_text(eval_xpath(result, './div[@class="gs_a"]')) |  | ||||||
|         if pub_info: |  | ||||||
|             content += "[%s]" % pub_info |  | ||||||
| 
 |  | ||||||
|         pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ct1"]')) |         pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ct1"]')) | ||||||
|         if pub_type: |         if pub_type: | ||||||
|             title = title + " " + pub_type |             pub_type = pub_type[1:-1].lower() | ||||||
|  | 
 | ||||||
|  |         url = eval_xpath_getindex(result, './/h3[1]//a/@href', 0) | ||||||
|  |         content = extract_text(eval_xpath(result, './/div[@class="gs_rs"]')) | ||||||
|  |         authors, journal, publisher, publishedDate = parse_gs_a( | ||||||
|  |             extract_text(eval_xpath(result, './/div[@class="gs_a"]')) | ||||||
|  |         ) | ||||||
|  |         if publisher in url: | ||||||
|  |             publisher = None | ||||||
|  | 
 | ||||||
|  |         # cited by | ||||||
|  |         comments = extract_text(eval_xpath(result, './/div[@class="gs_fl"]/a[starts-with(@href,"/scholar?cites=")]')) | ||||||
|  | 
 | ||||||
|  |         # link to the html or pdf document | ||||||
|  |         html_url = None | ||||||
|  |         pdf_url = None | ||||||
|  |         doc_url = eval_xpath_getindex(result, './/div[@class="gs_or_ggsm"]/a/@href', 0, default=None) | ||||||
|  |         doc_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]')) | ||||||
|  |         if doc_type == "[PDF]": | ||||||
|  |             pdf_url = doc_url | ||||||
|  |         else: | ||||||
|  |             html_url = doc_url | ||||||
| 
 | 
 | ||||||
|         results.append( |         results.append( | ||||||
|             { |             { | ||||||
|  |                 'template': 'paper.html', | ||||||
|  |                 'type': pub_type, | ||||||
|                 'url': url, |                 'url': url, | ||||||
|                 'title': title, |                 'title': title, | ||||||
|  |                 'authors': authors, | ||||||
|  |                 'publisher': publisher, | ||||||
|  |                 'journal': journal, | ||||||
|  |                 'publishedDate': publishedDate, | ||||||
|                 'content': content, |                 'content': content, | ||||||
|  |                 'comments': comments, | ||||||
|  |                 'html_url': html_url, | ||||||
|  |                 'pdf_url': pdf_url, | ||||||
|             } |             } | ||||||
|         ) |         ) | ||||||
| 
 | 
 | ||||||
|  | |||||||
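A hedged sketch of how the parse_gs_a helper added above behaves for the three formats named in its docstring; the author, journal and publisher strings are made-up examples:

    # assuming the helper is importable from the Google Scholar engine module
    from searx.engines.google_scholar import parse_gs_a

    print(parse_gs_a('A Einstein, B Podolsky - Physical Review, 1935 - APS'))
    # (['A Einstein', 'B Podolsky'], 'Physical Review', 'APS', datetime(1935, 1, 1))

    print(parse_gs_a('A Einstein - 1905 - Wiley'))
    # (['A Einstein'], None, 'Wiley', datetime(1905, 1, 1))

    print(parse_gs_a('A Einstein - Princeton University Press'))
    # (['A Einstein'], None, 'Princeton University Press', None)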
| @ -3,11 +3,15 @@ | |||||||
|  PubMed (Scholar publications) |  PubMed (Scholar publications) | ||||||
| """ | """ | ||||||
| 
 | 
 | ||||||
| from flask_babel import gettext |  | ||||||
| from lxml import etree | from lxml import etree | ||||||
| from datetime import datetime | from datetime import datetime | ||||||
| from urllib.parse import urlencode | from urllib.parse import urlencode | ||||||
| from searx.network import get | from searx.network import get | ||||||
|  | from searx.utils import ( | ||||||
|  |     eval_xpath_getindex, | ||||||
|  |     eval_xpath_list, | ||||||
|  |     extract_text, | ||||||
|  | ) | ||||||
| 
 | 
 | ||||||
| # about | # about | ||||||
| about = { | about = { | ||||||
| @ -22,7 +26,7 @@ about = { | |||||||
|     "results": 'XML', |     "results": 'XML', | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| categories = ['science'] | categories = ['science', 'scientific publications'] | ||||||
| 
 | 
 | ||||||
| base_url = ( | base_url = ( | ||||||
|     'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi' + '?db=pubmed&{query}&retstart={offset}&retmax={hits}' |     'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi' + '?db=pubmed&{query}&retstart={offset}&retmax={hits}' | ||||||
| @ -63,45 +67,60 @@ def response(resp): | |||||||
| 
 | 
 | ||||||
|     retrieve_url_encoded = pubmed_retrieve_api_url.format(**retrieve_notice_args) |     retrieve_url_encoded = pubmed_retrieve_api_url.format(**retrieve_notice_args) | ||||||
| 
 | 
 | ||||||
|     search_results_xml = get(retrieve_url_encoded).content |     search_results_response = get(retrieve_url_encoded).content | ||||||
|     search_results = etree.XML(search_results_xml).xpath('//PubmedArticleSet/PubmedArticle/MedlineCitation') |     search_results = etree.XML(search_results_response) | ||||||
|  |     for entry in eval_xpath_list(search_results, '//PubmedArticle'): | ||||||
|  |         medline = eval_xpath_getindex(entry, './MedlineCitation', 0) | ||||||
| 
 | 
 | ||||||
|     for entry in search_results: |         title = eval_xpath_getindex(medline, './/Article/ArticleTitle', 0).text | ||||||
|         title = entry.xpath('.//Article/ArticleTitle')[0].text |         pmid = eval_xpath_getindex(medline, './/PMID', 0).text | ||||||
| 
 |  | ||||||
|         pmid = entry.xpath('.//PMID')[0].text |  | ||||||
|         url = pubmed_url + pmid |         url = pubmed_url + pmid | ||||||
|  |         content = extract_text( | ||||||
|  |             eval_xpath_getindex(medline, './/Abstract/AbstractText//text()', 0, default=None), allow_none=True | ||||||
|  |         ) | ||||||
|  |         doi = extract_text( | ||||||
|  |             eval_xpath_getindex(medline, './/ELocationID[@EIdType="doi"]/text()', 0, default=None), allow_none=True | ||||||
|  |         ) | ||||||
|  |         journal = extract_text( | ||||||
|  |             eval_xpath_getindex(medline, './Article/Journal/Title/text()', 0, default=None), allow_none=True | ||||||
|  |         ) | ||||||
|  |         issn = extract_text( | ||||||
|  |             eval_xpath_getindex(medline, './Article/Journal/ISSN/text()', 0, default=None), allow_none=True | ||||||
|  |         ) | ||||||
|  |         authors = [] | ||||||
|  |         for author in eval_xpath_list(medline, './Article/AuthorList/Author'): | ||||||
|  |             f = eval_xpath_getindex(author, './ForeName', 0, default=None) | ||||||
|  |             l = eval_xpath_getindex(author, './LastName', 0, default=None) | ||||||
|  |             f = '' if f is None else f.text | ||||||
|  |             l = '' if l is None else l.text | ||||||
|  |             authors.append((f + ' ' + l).strip()) | ||||||
| 
 | 
 | ||||||
|         try: |         res_dict = { | ||||||
|             content = entry.xpath('.//Abstract/AbstractText')[0].text |             'template': 'paper.html', | ||||||
|         except: |             'url': url, | ||||||
|             content = gettext('No abstract is available for this publication.') |             'title': title, | ||||||
| 
 |             'content': content, | ||||||
|         #  If a doi is available, add it to the snipppet |             'journal': journal, | ||||||
|         try: |             'issn': [issn], | ||||||
|             doi = entry.xpath('.//ELocationID[@EIdType="doi"]')[0].text |             'authors': authors, | ||||||
|             content = 'DOI: {doi} Abstract: {content}'.format(doi=doi, content=content) |             'doi': doi, | ||||||
|         except: |         } | ||||||
|             pass |  | ||||||
| 
 |  | ||||||
|         if len(content) > 300: |  | ||||||
|             content = content[0:300] + "..." |  | ||||||
|         # TODO: center snippet on query term |  | ||||||
| 
 |  | ||||||
|         res_dict = {'url': url, 'title': title, 'content': content} |  | ||||||
| 
 | 
 | ||||||
|  |         accepted_date = eval_xpath_getindex( | ||||||
|  |             entry, './PubmedData/History//PubMedPubDate[@PubStatus="accepted"]', 0, default=None | ||||||
|  |         ) | ||||||
|  |         if accepted_date is not None: | ||||||
|  |             year = eval_xpath_getindex(accepted_date, './Year', 0) | ||||||
|  |             month = eval_xpath_getindex(accepted_date, './Month', 0) | ||||||
|  |             day = eval_xpath_getindex(accepted_date, './Day', 0) | ||||||
|             try: |             try: | ||||||
|                 publishedDate = datetime.strptime( |                 publishedDate = datetime.strptime( | ||||||
|                 entry.xpath('.//DateCreated/Year')[0].text |                     year.text + '-' + month.text + '-' + day.text, | ||||||
|                 + '-' |  | ||||||
|                 + entry.xpath('.//DateCreated/Month')[0].text |  | ||||||
|                 + '-' |  | ||||||
|                 + entry.xpath('.//DateCreated/Day')[0].text, |  | ||||||
|                     '%Y-%m-%d', |                     '%Y-%m-%d', | ||||||
|                 ) |                 ) | ||||||
|                 res_dict['publishedDate'] = publishedDate |                 res_dict['publishedDate'] = publishedDate | ||||||
|         except: |             except Exception as e: | ||||||
|             pass |                 print(e) | ||||||
| 
 | 
 | ||||||
|         results.append(res_dict) |         results.append(res_dict) | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -6,6 +6,8 @@ | |||||||
| from json import dumps, loads | from json import dumps, loads | ||||||
| from datetime import datetime | from datetime import datetime | ||||||
| 
 | 
 | ||||||
|  | from flask_babel import gettext | ||||||
|  | 
 | ||||||
| about = { | about = { | ||||||
|     "website": 'https://www.semanticscholar.org/', |     "website": 'https://www.semanticscholar.org/', | ||||||
|     "wikidata_id": 'Q22908627', |     "wikidata_id": 'Q22908627', | ||||||
| @ -15,6 +17,7 @@ about = { | |||||||
|     "results": 'JSON', |     "results": 'JSON', | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | categories = ['science', 'scientific publications'] | ||||||
| paging = True | paging = True | ||||||
| search_url = 'https://www.semanticscholar.org/api/1/search' | search_url = 'https://www.semanticscholar.org/api/1/search' | ||||||
| paper_url = 'https://www.semanticscholar.org/paper' | paper_url = 'https://www.semanticscholar.org/paper' | ||||||
| @ -47,9 +50,6 @@ def response(resp): | |||||||
|     results = [] |     results = [] | ||||||
| 
 | 
 | ||||||
|     for result in res['results']: |     for result in res['results']: | ||||||
|         item = {} |  | ||||||
|         metadata = [] |  | ||||||
| 
 |  | ||||||
|         url = result.get('primaryPaperLink', {}).get('url') |         url = result.get('primaryPaperLink', {}).get('url') | ||||||
|         if not url and result.get('links'): |         if not url and result.get('links'): | ||||||
|             url = result.get('links')[0] |             url = result.get('links')[0] | ||||||
| @ -60,22 +60,47 @@ def response(resp): | |||||||
|         if not url: |         if not url: | ||||||
|             url = paper_url + '/%s' % result['id'] |             url = paper_url + '/%s' % result['id'] | ||||||
| 
 | 
 | ||||||
|         item['url'] = url |         # publishedDate | ||||||
|  |         if 'pubDate' in result: | ||||||
|  |             publishedDate = datetime.strptime(result['pubDate'], "%Y-%m-%d") | ||||||
|  |         else: | ||||||
|  |             publishedDate = None | ||||||
| 
 | 
 | ||||||
|         item['title'] = result['title']['text'] |         # authors | ||||||
|         item['content'] = result['paperAbstract']['text'] |         authors = [author[0]['name'] for author in result.get('authors', [])] | ||||||
| 
 | 
 | ||||||
|         metadata = result.get('fieldsOfStudy') or [] |         # pick the first alternate link that is not from the crawler | ||||||
|         venue = result.get('venue', {}).get('text') |         pdf_url = None | ||||||
|         if venue: |         for doc in result.get('alternatePaperLinks', []): | ||||||
|             metadata.append(venue) |             if doc['linkType'] != 'crawler': | ||||||
|         if metadata: |                 pdf_url = doc['url'] | ||||||
|             item['metadata'] = ', '.join(metadata) |                 break | ||||||
| 
 | 
 | ||||||
|         pubDate = result.get('pubDate') |         # comments | ||||||
|         if pubDate: |         comments = None | ||||||
|             item['publishedDate'] = datetime.strptime(pubDate, "%Y-%m-%d") |         if 'citationStats' in result: | ||||||
|  |             comments = gettext( | ||||||
|  |                 '{numCitations} citations from the year {firstCitationVelocityYear} to {lastCitationVelocityYear}' | ||||||
|  |             ).format( | ||||||
|  |                 numCitations=result['citationStats']['numCitations'], | ||||||
|  |                 firstCitationVelocityYear=result['citationStats']['firstCitationVelocityYear'], | ||||||
|  |                 lastCitationVelocityYear=result['citationStats']['lastCitationVelocityYear'], | ||||||
|  |             ) | ||||||
| 
 | 
 | ||||||
|         results.append(item) |         results.append( | ||||||
|  |             { | ||||||
|  |                 'template': 'paper.html', | ||||||
|  |                 'url': url, | ||||||
|  |                 'title': result['title']['text'], | ||||||
|  |                 'content': result['paperAbstract']['text'], | ||||||
|  |                 'journal': result.get('venue', {}).get('text') or result.get('journal', {}).get('name'), | ||||||
|  |                 'doi': result.get('doiInfo', {}).get('doi'), | ||||||
|  |                 'tags': result.get('fieldsOfStudy'), | ||||||
|  |                 'authors': authors, | ||||||
|  |                 'pdf_url': pdf_url, | ||||||
|  |                 'publishedDate': publishedDate, | ||||||
|  |                 'comments': comments, | ||||||
|  |             } | ||||||
|  |         ) | ||||||
| 
 | 
 | ||||||
|     return results |     return results | ||||||
|  | |||||||
| @ -19,7 +19,7 @@ about = { | |||||||
|     "results": 'JSON', |     "results": 'JSON', | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| categories = ['science'] | categories = ['science', 'scientific publications'] | ||||||
| paging = True | paging = True | ||||||
| nb_per_page = 10 | nb_per_page = 10 | ||||||
| api_key = 'unset' | api_key = 'unset' | ||||||
| @ -41,32 +41,30 @@ def response(resp): | |||||||
|     json_data = loads(resp.text) |     json_data = loads(resp.text) | ||||||
| 
 | 
 | ||||||
|     for record in json_data['records']: |     for record in json_data['records']: | ||||||
|         content = record['abstract'][0:500] |         content = record['abstract'] | ||||||
|         if len(record['abstract']) > len(content): |  | ||||||
|             content += "..." |  | ||||||
|         published = datetime.strptime(record['publicationDate'], '%Y-%m-%d') |         published = datetime.strptime(record['publicationDate'], '%Y-%m-%d') | ||||||
| 
 |         authors = [" ".join(author['creator'].split(', ')[::-1]) for author in record['creators']] | ||||||
|         metadata = [ |         tags = record.get('genre') | ||||||
|             record[x] |         if isinstance(tags, str): | ||||||
|             for x in [ |             tags = [tags] | ||||||
|                 'publicationName', |  | ||||||
|                 'identifier', |  | ||||||
|                 'contentType', |  | ||||||
|             ] |  | ||||||
|             if record.get(x) is not None |  | ||||||
|         ] |  | ||||||
| 
 |  | ||||||
|         metadata = ' / '.join(metadata) |  | ||||||
|         if record.get('startingPage') and record.get('endingPage') is not None: |  | ||||||
|             metadata += " (%(startingPage)s-%(endingPage)s)" % record |  | ||||||
| 
 |  | ||||||
|         results.append( |         results.append( | ||||||
|             { |             { | ||||||
|  |                 'template': 'paper.html', | ||||||
|                 'title': record['title'], |                 'title': record['title'], | ||||||
|                 'url': record['url'][0]['value'].replace('http://', 'https://', 1), |                 'url': record['url'][0]['value'].replace('http://', 'https://', 1), | ||||||
|  |                 'type': record.get('contentType'), | ||||||
|                 'content': content, |                 'content': content, | ||||||
|                 'publishedDate': published, |                 'publishedDate': published, | ||||||
|                 'metadata': metadata, |                 'authors': authors, | ||||||
|  |                 'doi': record.get('doi'), | ||||||
|  |                 'journal': record.get('publicationName'), | ||||||
|  |                 'start_page': record.get('start_page'), | ||||||
|  |                 'end_page': record.get('end_page'), | ||||||
|  |                 'tags': tags, | ||||||
|  |                 'issn': [record.get('issn')], | ||||||
|  |                 'isbn': [record.get('isbn')], | ||||||
|  |                 'volume': record.get('volume') or None, | ||||||
|  |                 'number': record.get('number') or None, | ||||||
|             } |             } | ||||||
|         ) |         ) | ||||||
|     return results |     return results | ||||||
|  | |||||||
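For clarity (not part of the commit): the authors list comprehension above turns Springer's "Last, First" creator strings into "First Last". A small example with placeholder names:

    record = {'creators': [{'creator': 'Curie, Marie'}, {'creator': 'Sklodowska-Curie, Irene'}]}
    authors = [" ".join(author['creator'].split(', ')[::-1]) for author in record['creators']]
    # authors == ['Marie Curie', 'Irene Sklodowska-Curie']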
| @ -43,6 +43,7 @@ CATEGORY_GROUPS = { | |||||||
|     'REPOS': 'repos', |     'REPOS': 'repos', | ||||||
|     'SOFTWARE_WIKIS': 'software wikis', |     'SOFTWARE_WIKIS': 'software wikis', | ||||||
|     'WEB': 'web', |     'WEB': 'web', | ||||||
|  |     'SCIENTIFIC PUBLICATIONS': 'scientific publications', | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| STYLE_NAMES = { | STYLE_NAMES = { | ||||||
|  | |||||||
| @ -319,7 +319,6 @@ engines: | |||||||
|   - name: arxiv |   - name: arxiv | ||||||
|     engine: arxiv |     engine: arxiv | ||||||
|     shortcut: arx |     shortcut: arx | ||||||
|     categories: science |  | ||||||
|     timeout: 4.0 |     timeout: 4.0 | ||||||
| 
 | 
 | ||||||
|   # tmp suspended:  dh key too small |   # tmp suspended:  dh key too small | ||||||
| @ -411,23 +410,9 @@ engines: | |||||||
|   #   api_key: 'unset' |   #   api_key: 'unset' | ||||||
| 
 | 
 | ||||||
|   - name: crossref |   - name: crossref | ||||||
|     engine: json_engine |     engine: crossref | ||||||
|     paging: true |  | ||||||
|     search_url: https://search.crossref.org/dois?q={query}&page={pageno} |  | ||||||
|     url_query: doi |  | ||||||
|     title_query: title |  | ||||||
|     title_html_to_text: true |  | ||||||
|     content_query: fullCitation |  | ||||||
|     content_html_to_text: true |  | ||||||
|     categories: science |  | ||||||
|     shortcut: cr |     shortcut: cr | ||||||
|     about: |     timeout: 10 | ||||||
|       website: https://www.crossref.org/ |  | ||||||
|       wikidata_id: Q5188229 |  | ||||||
|       official_api_documentation: https://github.com/CrossRef/rest-api-doc |  | ||||||
|       use_official_api: false |  | ||||||
|       require_api_key: false |  | ||||||
|       results: JSON |  | ||||||
| 
 | 
 | ||||||
|   - name: yep |   - name: yep | ||||||
|     engine: json_engine |     engine: json_engine | ||||||
| @ -1068,7 +1053,7 @@ engines: | |||||||
|     title_query: metadata/oaf:entity/oaf:result/title/$ |     title_query: metadata/oaf:entity/oaf:result/title/$ | ||||||
|     content_query: metadata/oaf:entity/oaf:result/description/$ |     content_query: metadata/oaf:entity/oaf:result/description/$ | ||||||
|     content_html_to_text: true |     content_html_to_text: true | ||||||
|     categories: science |     categories: "science" | ||||||
|     shortcut: oad |     shortcut: oad | ||||||
|     timeout: 5.0 |     timeout: 5.0 | ||||||
|     about: |     about: | ||||||
| @ -1198,7 +1183,6 @@ engines: | |||||||
|   - name: pubmed |   - name: pubmed | ||||||
|     engine: pubmed |     engine: pubmed | ||||||
|     shortcut: pub |     shortcut: pub | ||||||
|     categories: science |  | ||||||
|     timeout: 3.0 |     timeout: 3.0 | ||||||
| 
 | 
 | ||||||
|   - name: pypi |   - name: pypi | ||||||
| @ -1346,7 +1330,6 @@ engines: | |||||||
|     engine: semantic_scholar |     engine: semantic_scholar | ||||||
|     disabled: true |     disabled: true | ||||||
|     shortcut: se |     shortcut: se | ||||||
|     categories: science |  | ||||||
| 
 | 
 | ||||||
|   # Spotify needs API credentials |   # Spotify needs API credentials | ||||||
|   # - name: spotify |   # - name: spotify | ||||||
| @ -1372,8 +1355,7 @@ engines: | |||||||
|   #   # working API key, for test & debug: "a69685087d07eca9f13db62f65b8f601" |   #   # working API key, for test & debug: "a69685087d07eca9f13db62f65b8f601" | ||||||
|   #   api_key: 'unset' |   #   api_key: 'unset' | ||||||
|   #   shortcut: springer |   #   shortcut: springer | ||||||
|   #   categories: science |   #   timeout: 15.0 | ||||||
|   #   timeout: 6.0 |  | ||||||
| 
 | 
 | ||||||
|   - name: startpage |   - name: startpage | ||||||
|     engine: startpage |     engine: startpage | ||||||
|  | |||||||