| 
									
										
										
										
											2013-12-29 16:21:20 +01:00
										 |  |  | from urllib import urlencode | 
					
						
							|  |  |  | from HTMLParser import HTMLParser | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | url = 'http://www.filecrop.com/' | 
					
						
							| 
									
										
										
										
											2014-01-30 01:14:08 +01:00
										 |  |  | search_url = url + '/search.php?{query}&size_i=0&size_f=100000000&engine_r=1&engine_d=1&engine_e=1&engine_4=1&engine_m=1&pos={index}'  # noqa | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | paging = True | 
					
						
							| 
									
										
										
										
											2014-01-20 02:31:20 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-12-29 16:21:20 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | class FilecropResultParser(HTMLParser): | 
					
						
							| 
									
										
										
										
											2016-07-10 16:44:27 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-12-29 16:21:20 +01:00
										 |  |  |     def __init__(self): | 
					
						
							|  |  |  |         HTMLParser.__init__(self) | 
					
						
							|  |  |  |         self.__start_processing = False | 
					
						
							| 
									
										
										
										
											2014-01-05 00:46:42 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-12-29 16:21:20 +01:00
										 |  |  |         self.results = [] | 
					
						
							|  |  |  |         self.result = {} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         self.tr_counter = 0 | 
					
						
							|  |  |  |         self.data_counter = 0 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def handle_starttag(self, tag, attrs): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if tag == 'tr': | 
					
						
							| 
									
										
										
										
											2014-01-20 02:31:20 +01:00
										 |  |  |             if ('bgcolor', '#edeff5') in attrs or\ | 
					
						
							|  |  |  |                ('bgcolor', '#ffffff') in attrs: | 
					
						
							| 
									
										
										
										
											2013-12-29 16:21:20 +01:00
										 |  |  |                 self.__start_processing = True | 
					
						
							| 
									
										
										
										
											2014-01-05 00:46:42 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-12-29 16:21:20 +01:00
										 |  |  |         if not self.__start_processing: | 
					
						
							|  |  |  |             return | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if tag == 'label': | 
					
						
							| 
									
										
										
										
											2014-01-20 02:31:20 +01:00
										 |  |  |             self.result['title'] = [attr[1] for attr in attrs | 
					
						
							|  |  |  |                                     if attr[0] == 'title'][0] | 
					
						
							|  |  |  |         elif tag == 'a' and ('rel', 'nofollow') in attrs\ | 
					
						
							|  |  |  |                 and ('class', 'sourcelink') in attrs: | 
					
						
							| 
									
										
										
										
											2013-12-29 16:21:20 +01:00
										 |  |  |             if 'content' in self.result: | 
					
						
							| 
									
										
										
										
											2014-01-20 02:31:20 +01:00
										 |  |  |                 self.result['content'] += [attr[1] for attr in attrs | 
					
						
							|  |  |  |                                            if attr[0] == 'title'][0] | 
					
						
							| 
									
										
										
										
											2013-12-29 16:21:20 +01:00
										 |  |  |             else: | 
					
						
							| 
									
										
										
										
											2014-01-20 02:31:20 +01:00
										 |  |  |                 self.result['content'] = [attr[1] for attr in attrs | 
					
						
							|  |  |  |                                           if attr[0] == 'title'][0] | 
					
						
							| 
									
										
										
										
											2013-12-29 16:21:20 +01:00
										 |  |  |             self.result['content'] += ' ' | 
					
						
							|  |  |  |         elif tag == 'a': | 
					
						
							| 
									
										
										
										
											2014-01-20 02:31:20 +01:00
										 |  |  |             self.result['url'] = url + [attr[1] for attr in attrs | 
					
						
							|  |  |  |                                         if attr[0] == 'href'][0] | 
					
						
							| 
									
										
										
										
											2013-12-29 16:21:20 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  |     def handle_endtag(self, tag): | 
					
						
							|  |  |  |         if self.__start_processing is False: | 
					
						
							|  |  |  |             return | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if tag == 'tr': | 
					
						
							|  |  |  |             self.tr_counter += 1 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             if self.tr_counter == 2: | 
					
						
							|  |  |  |                 self.__start_processing = False | 
					
						
							|  |  |  |                 self.tr_counter = 0 | 
					
						
							|  |  |  |                 self.data_counter = 0 | 
					
						
							|  |  |  |                 self.results.append(self.result) | 
					
						
							|  |  |  |                 self.result = {} | 
					
						
							| 
									
										
										
										
											2014-01-05 00:46:42 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-12-29 16:21:20 +01:00
										 |  |  |     def handle_data(self, data): | 
					
						
							|  |  |  |         if not self.__start_processing: | 
					
						
							|  |  |  |             return | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if 'content' in self.result: | 
					
						
							|  |  |  |             self.result['content'] += data + ' ' | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             self.result['content'] = data + ' ' | 
					
						
							| 
									
										
										
										
											2014-01-05 00:46:42 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-12-29 16:21:20 +01:00
										 |  |  |         self.data_counter += 1 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-01-20 02:31:20 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-12-29 16:21:20 +01:00
										 |  |  | def request(query, params): | 
					
						
							| 
									
										
										
										
											2014-01-30 01:14:08 +01:00
										 |  |  |     index = 1 + (params['pageno'] - 1) * 30 | 
					
						
							|  |  |  |     params['url'] = search_url.format(query=urlencode({'w': query}), | 
					
						
							|  |  |  |                                       index=index) | 
					
						
							| 
									
										
										
										
											2013-12-29 16:21:20 +01:00
										 |  |  |     return params | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-01-20 02:31:20 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-12-29 16:21:20 +01:00
										 |  |  | def response(resp): | 
					
						
							|  |  |  |     parser = FilecropResultParser() | 
					
						
							|  |  |  |     parser.feed(resp.text) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return parser.results |