| 
									
										
										
										
											2015-05-02 15:45:17 +02:00
										 |  |  | """
 | 
					
						
							|  |  |  |  General Files (Files) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |  @website     http://www.general-files.org | 
					
						
							|  |  |  |  @provide-api no (nothing found) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |  @using-api   no (because nothing found) | 
					
						
							|  |  |  |  @results     HTML (using search portal) | 
					
						
							|  |  |  |  @stable      no (HTML can change) | 
					
						
							|  |  |  |  @parse       url, title, content | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |  @todo        detect torrents? | 
					
						
							|  |  |  | """
 | 
					
						
							| 
									
										
										
										
											2014-09-02 17:28:35 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-06-27 17:25:16 +02:00
										 |  |  | from lxml import html | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-02 17:28:35 +02:00
# engine dependent config
categories = ['files']
paging = True

# search-url
# NOTE(review): the module docstring lists http://www.general-files.org as
# the website while base_url points at general-file.com -- confirm which
# host is the current one.
base_url = 'http://www.general-file.com'
# placeholders are filled in by request(): {letter} is the first character
# of the query, {query} the query itself and {pageno} the requested page
search_url = base_url + '/files-{letter}/{query}/{pageno}'

# specific xpath variables
# result_xpath is evaluated on the whole document; the other expressions
# are evaluated relative to each node it matches
result_xpath = '//table[@class="block-file"]'
title_xpath = './/h2/a//text()'
url_xpath = './/h2/a/@href'
content_xpath = './/p//text()'
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-02 17:28:35 +02:00
										 |  |  | # do search-request | 
					
						
							| 
									
										
										
										
											2014-06-27 17:25:16 +02:00
										 |  |  | def request(query, params): | 
					
						
							| 
									
										
										
										
											2014-09-02 17:28:35 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-06-27 17:25:16 +02:00
										 |  |  |     params['url'] = search_url.format(query=query, | 
					
						
							|  |  |  |                                       letter=query[0], | 
					
						
							|  |  |  |                                       pageno=params['pageno']) | 
					
						
							| 
									
										
										
										
											2014-09-02 17:28:35 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-06-27 17:25:16 +02:00
										 |  |  |     return params | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-02 17:28:35 +02:00
										 |  |  | # get response from search-request | 
					
						
							| 
									
										
										
										
											2014-06-27 17:25:16 +02:00
										 |  |  | def response(resp): | 
					
						
							|  |  |  |     results = [] | 
					
						
							| 
									
										
										
										
											2014-09-02 17:28:35 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-06-27 17:25:16 +02:00
										 |  |  |     dom = html.fromstring(resp.text) | 
					
						
							| 
									
										
										
										
											2014-09-02 17:28:35 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # parse results | 
					
						
							| 
									
										
										
										
											2014-06-27 17:25:16 +02:00
										 |  |  |     for result in dom.xpath(result_xpath): | 
					
						
							|  |  |  |         url = result.xpath(url_xpath)[0] | 
					
						
							| 
									
										
										
										
											2014-09-02 17:28:35 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-06-27 17:25:16 +02:00
										 |  |  |         # skip fast download links | 
					
						
							|  |  |  |         if not url.startswith('/'): | 
					
						
							|  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2014-09-02 17:28:35 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |         # append result | 
					
						
							| 
									
										
										
										
											2014-06-27 17:25:16 +02:00
										 |  |  |         results.append({'url': base_url + url, | 
					
						
							|  |  |  |                         'title': ''.join(result.xpath(title_xpath)), | 
					
						
							|  |  |  |                         'content': ''.join(result.xpath(content_xpath))}) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-02 17:28:35 +02:00
										 |  |  |     # return results | 
					
						
							| 
									
										
										
										
											2014-06-27 17:25:16 +02:00
										 |  |  |     return results |