2015-05-02 15:45:17 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								"""
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								 General Files (Files)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								 @website     http://www.general-files.org
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								 @provide-api no (nothing found)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								 @using-api   no (because nothing found)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								 @results     HTML (using search portal)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								 @stable      no (HTML can change)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								 @parse       url, title, content
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								 @todo        detect torrents?
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								"""
							 | 
						
					
						
							
								
									
										
										
										
											2014-09-02 17:28:35 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2014-06-27 17:25:16 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								from lxml import html
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2014-09-02 17:28:35 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								# engine dependent config
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								categories = ['files']
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								paging = True
							 | 
						
					
						
							
								
									
										
										
										
											2014-06-27 17:25:16 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2014-09-02 17:28:35 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								# search-url
							 | 
						
					
						
							
								
									
										
										
										
											2014-06-27 17:25:16 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								base_url = 'http://www.general-file.com'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								search_url = base_url + '/files-{letter}/{query}/{pageno}'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2014-09-02 17:28:35 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								# specific xpath variables
							 | 
						
					
						
							
								
									
										
										
										
											2014-06-27 17:25:16 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								result_xpath = '//table[@class="block-file"]'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								title_xpath = './/h2/a//text()'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								url_xpath = './/h2/a/@href'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								content_xpath = './/p//text()'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2014-09-02 17:28:35 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								# do search-request
							 | 
						
					
						
							
								
									
										
										
										
											2014-06-27 17:25:16 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								def request(query, params):
							 | 
						
					
						
							
								
									
										
										
										
											2014-09-02 17:28:35 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2014-06-27 17:25:16 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    params['url'] = search_url.format(query=query,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                      letter=query[0],
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                                      pageno=params['pageno'])
							 | 
						
					
						
							
								
									
										
										
										
											2014-09-02 17:28:35 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2014-06-27 17:25:16 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    return params
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2014-09-02 17:28:35 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								# get response from search-request
							 | 
						
					
						
							
								
									
										
										
										
											2014-06-27 17:25:16 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								def response(resp):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    results = []
							 | 
						
					
						
							
								
									
										
										
										
											2014-09-02 17:28:35 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2014-06-27 17:25:16 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    dom = html.fromstring(resp.text)
							 | 
						
					
						
							
								
									
										
										
										
											2014-09-02 17:28:35 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # parse results
							 | 
						
					
						
							
								
									
										
										
										
											2014-06-27 17:25:16 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    for result in dom.xpath(result_xpath):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        url = result.xpath(url_xpath)[0]
							 | 
						
					
						
							
								
									
										
										
										
											2014-09-02 17:28:35 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2014-06-27 17:25:16 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        # skip fast download links
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if not url.startswith('/'):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            continue
							 | 
						
					
						
							
								
									
										
										
										
											2014-09-02 17:28:35 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        # append result
							 | 
						
					
						
							
								
									
										
										
										
											2014-06-27 17:25:16 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        results.append({'url': base_url + url,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        'title': ''.join(result.xpath(title_xpath)),
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        'content': ''.join(result.xpath(content_xpath))})
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2014-09-02 17:28:35 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    # return results
							 | 
						
					
						
							
								
									
										
										
										
											2014-06-27 17:25:16 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    return results
							 |