[enh] bing_images: use data from embedded JSON to improve results (e.g. real page title) (#1568)
Use data from the embedded JSON to improve results (e.g. the real page title), add image format and source info (see PR #1567), and improve the paging logic (it now works).
This commit is contained in:
		
							parent
							
								
									f34b5cedb1
								
							
						
					
					
						commit
						11fc9913e9
					
				| @ -10,9 +10,6 @@ | |||||||
|  @stable      no (HTML can change) |  @stable      no (HTML can change) | ||||||
|  @parse       url, title, img_src |  @parse       url, title, img_src | ||||||
| 
 | 
 | ||||||
|  @todo        currently there are up to 35 images receive per page, |  | ||||||
|               because bing does not parse count=10. |  | ||||||
|               limited response to 10 images |  | ||||||
| """ | """ | ||||||
| 
 | 
 | ||||||
| from lxml import html | from lxml import html | ||||||
| @ -28,10 +25,15 @@ safesearch = True | |||||||
| time_range_support = True | time_range_support = True | ||||||
| language_support = True | language_support = True | ||||||
| supported_languages_url = 'https://www.bing.com/account/general' | supported_languages_url = 'https://www.bing.com/account/general' | ||||||
|  | number_of_results = 28 | ||||||
| 
 | 
 | ||||||
| # search-url | # search-url | ||||||
| base_url = 'https://www.bing.com/' | base_url = 'https://www.bing.com/' | ||||||
| search_string = 'images/search?{query}&count=10&first={offset}' | search_string = 'images/search'\ | ||||||
|  |     '?{query}'\ | ||||||
|  |     '&count={count}'\ | ||||||
|  |     '&first={first}'\ | ||||||
|  |     '&FORM=IBASEP' | ||||||
| time_range_string = '&qft=+filterui:age-lt{interval}' | time_range_string = '&qft=+filterui:age-lt{interval}' | ||||||
| time_range_dict = {'day': '1440', | time_range_dict = {'day': '1440', | ||||||
|                    'week': '10080', |                    'week': '10080', | ||||||
| @ -44,16 +46,14 @@ safesearch_types = {2: 'STRICT', | |||||||
|                     0: 'OFF'} |                     0: 'OFF'} | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| _quote_keys_regex = re.compile('({|,)([a-z][a-z0-9]*):(")', re.I | re.U) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # do search-request | # do search-request | ||||||
| def request(query, params): | def request(query, params): | ||||||
|     offset = (params['pageno'] - 1) * 10 + 1 |     offset = ((params['pageno'] - 1) * number_of_results) + 1 | ||||||
| 
 | 
 | ||||||
|     search_path = search_string.format( |     search_path = search_string.format( | ||||||
|         query=urlencode({'q': query}), |         query=urlencode({'q': query}), | ||||||
|         offset=offset) |         count=number_of_results, | ||||||
|  |         first=offset) | ||||||
| 
 | 
 | ||||||
|     language = match_language(params['language'], supported_languages, language_aliases).lower() |     language = match_language(params['language'], supported_languages, language_aliases).lower() | ||||||
| 
 | 
 | ||||||
| @ -77,32 +77,31 @@ def response(resp): | |||||||
|     dom = html.fromstring(resp.text) |     dom = html.fromstring(resp.text) | ||||||
| 
 | 
 | ||||||
|     # parse results |     # parse results | ||||||
|     for result in dom.xpath('//div[@id="mmComponent_images_1"]/ul/li/div/div[@class="imgpt"]'): |     for result in dom.xpath('//div[@class="imgpt"]'): | ||||||
|         link = result.xpath('./a')[0] |  | ||||||
| 
 | 
 | ||||||
|         # TODO find actual title |         img_format = result.xpath('./div[contains(@class, "img_info")]/span/text()')[0] | ||||||
|         title = link.xpath('.//img/@alt')[0] |         # Microsoft seems to experiment with this code so don't make the path too specific, | ||||||
|  |         # just catch the text section for the first anchor in img_info assuming this to be | ||||||
|  |         # the originating site. | ||||||
|  |         source = result.xpath('./div[contains(@class, "img_info")]//a/text()')[0] | ||||||
| 
 | 
 | ||||||
|         # parse json-data (it is required to add a space, to make it parsable) |         try: | ||||||
|         json_data = loads(_quote_keys_regex.sub(r'\1"\2": \3', link.attrib.get('m'))) |             m = loads(result.xpath('./a/@m')[0]) | ||||||
| 
 | 
 | ||||||
|         url = json_data.get('purl') |             # strip 'Unicode private use area' highlighting, they render to Tux | ||||||
|         img_src = json_data.get('murl') |             # the Linux penguin and a standing diamond on my machine... | ||||||
|         thumbnail = json_data.get('turl') |             title = m.get('t', '').replace(u'\ue000', '').replace(u'\ue001', '') | ||||||
|  |             results.append({'template': 'images.html', | ||||||
|  |                             'url': m['purl'], | ||||||
|  |                             'thumbnail_src': m['turl'], | ||||||
|  |                             'img_src': m['murl'], | ||||||
|  |                             'content': '', | ||||||
|  |                             'title': title, | ||||||
|  |                             'source': source, | ||||||
|  |                             'img_format': img_format}) | ||||||
|  |         except: | ||||||
|  |             continue | ||||||
| 
 | 
 | ||||||
|         # append result |  | ||||||
|         results.append({'template': 'images.html', |  | ||||||
|                         'url': url, |  | ||||||
|                         'title': title, |  | ||||||
|                         'content': '', |  | ||||||
|                         'thumbnail_src': thumbnail, |  | ||||||
|                         'img_src': img_src}) |  | ||||||
| 
 |  | ||||||
|         # TODO stop parsing if 10 images are found |  | ||||||
|         # if len(results) >= 10: |  | ||||||
|         #     break |  | ||||||
| 
 |  | ||||||
|     # return results |  | ||||||
|     return results |     return results | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -53,17 +53,25 @@ class TestBingImagesEngine(SearxTestCase): | |||||||
|                 <li> |                 <li> | ||||||
|                     <div> |                     <div> | ||||||
|                         <div class="imgpt"> |                         <div class="imgpt"> | ||||||
|                             <a m='{"purl":"page_url","murl":"img_url","turl":"thumb_url"}'> |                             <a m='{"purl":"page_url","murl":"img_url","turl":"thumb_url","t":"Page 1 title"}'> | ||||||
|                                 <img src="" alt="alt text" /> |                                 <img src="" alt="alt text" /> | ||||||
|                             </a> |                             </a> | ||||||
|  |                             <div class="img_info"> | ||||||
|  |                                 <span>1 x 1 - jpeg</span> | ||||||
|  |                                 <a>1.example.org</a> | ||||||
|  |                             </div> | ||||||
|                         </div> |                         </div> | ||||||
|                         <div></div> |                         <div></div> | ||||||
|                     </div> |                     </div> | ||||||
|                     <div> |                     <div> | ||||||
|                         <div class="imgpt"> |                         <div class="imgpt"> | ||||||
|                             <a m='{"purl":"page_url2","murl":"img_url2","turl":"thumb_url2"}'> |                             <a m='{"purl":"page_url2","murl":"img_url2","turl":"thumb_url2","t":"Page 2 title"}'> | ||||||
|                                 <img src="" alt="alt text 2" /> |                                 <img src="" alt="alt text 2" /> | ||||||
|                             </a> |                             </a> | ||||||
|  |                             <div class="img_info"> | ||||||
|  |                                 <span>2 x 2 - jpeg</span> | ||||||
|  |                                 <a>2.example.org</a> | ||||||
|  |                             </div> | ||||||
|                         </div> |                         </div> | ||||||
|                     </div> |                     </div> | ||||||
|                 </li> |                 </li> | ||||||
| @ -72,9 +80,13 @@ class TestBingImagesEngine(SearxTestCase): | |||||||
|                 <li> |                 <li> | ||||||
|                     <div> |                     <div> | ||||||
|                         <div class="imgpt"> |                         <div class="imgpt"> | ||||||
|                             <a m='{"purl":"page_url3","murl":"img_url3","turl":"thumb_url3"}'> |                             <a m='{"purl":"page_url3","murl":"img_url3","turl":"thumb_url3","t":"Page 3 title"}'> | ||||||
|                                 <img src="" alt="alt text 3" /> |                                 <img src="" alt="alt text 3" /> | ||||||
|                             </a> |                             </a> | ||||||
|  |                             <div class="img_info"> | ||||||
|  |                                 <span>3 x 3 - jpeg</span> | ||||||
|  |                                 <a>3.example.org</a> | ||||||
|  |                             </div> | ||||||
|                         </div> |                         </div> | ||||||
|                     </div> |                     </div> | ||||||
|                 </li> |                 </li> | ||||||
| @ -86,11 +98,13 @@ class TestBingImagesEngine(SearxTestCase): | |||||||
|         results = bing_images.response(response) |         results = bing_images.response(response) | ||||||
|         self.assertEqual(type(results), list) |         self.assertEqual(type(results), list) | ||||||
|         self.assertEqual(len(results), 3) |         self.assertEqual(len(results), 3) | ||||||
|         self.assertEqual(results[0]['title'], 'alt text') |         self.assertEqual(results[0]['title'], 'Page 1 title') | ||||||
|         self.assertEqual(results[0]['url'], 'page_url') |         self.assertEqual(results[0]['url'], 'page_url') | ||||||
|         self.assertEqual(results[0]['content'], '') |         self.assertEqual(results[0]['content'], '') | ||||||
|         self.assertEqual(results[0]['thumbnail_src'], 'thumb_url') |         self.assertEqual(results[0]['thumbnail_src'], 'thumb_url') | ||||||
|         self.assertEqual(results[0]['img_src'], 'img_url') |         self.assertEqual(results[0]['img_src'], 'img_url') | ||||||
|  |         self.assertEqual(results[0]['img_format'], '1 x 1 - jpeg') | ||||||
|  |         self.assertEqual(results[0]['source'], '1.example.org') | ||||||
| 
 | 
 | ||||||
|     def test_fetch_supported_languages(self): |     def test_fetch_supported_languages(self): | ||||||
|         html = """ |         html = """ | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user