- Moved everything to trunk to create a stable branch

Christophe Dumez
2006-09-30 16:02:39 +00:00
commit 969a02b93e
200 changed files with 46382 additions and 0 deletions

src/search_engine/nova.py (new executable file, 432 lines)

@@ -0,0 +1,432 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Version: 1.7
# Changelog:
# - merged with qbittorrent branch (code cleanup, fixed indentation mistakes)
# - separate standalone and slave mode
# - added btjunkie
# - added meganova
# - added multithreaded mode
# Version: 1.6
# Changelog:
# - size is now always returned in bytes
# - seeders/leechers are now always returned as integers
# - cleaned up code
# - results are now displayed in real time
# - fixed piratebay, torrentreactor
# Author:
# Fabien Devaux <fab AT gnux DOT info>
# Contributors:
# Christophe Dumez <chris@qbittorrent.org> (qbittorrent integration)
# Thanks to gab #gcu @ irc.freenode.net (multipage support on PirateBay)
# Thanks to Elias <gekko04@users.sourceforge.net> (torrentreactor and isohunt search engines)
#
# Licence: BSD
import sys
import urllib
import sgmllib
from xml.dom import minidom
import re
import os
import cgi
import traceback
import threading
STANDALONE = True
THREADED = True
if os.environ.has_key('QBITTORRENT'):
    STANDALONE = False
    THREADED = False
def prettyPrinter(dictionnary):
    print "%(link)s|%(name)s|%(size)s|%(seeds)s|%(leech)s|%(engine_url)s"%dictionnary

if STANDALONE:
    def termPrettyPrinter(dictionnary):
        dictionnary['size'] = bytesToHuman(dictionnary['size'])
        print "%(seeds)5s/%(leech)5s | %(size)10s | %(name)s "%dictionnary
        print "wget '%s'"%dictionnary['link'].replace("'","\\'")
    globals()['prettyPrinter'] = termPrettyPrinter
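# For illustration, a slave-mode (qBittorrent) output line looks like this,
# one pipe-separated result per line (hypothetical values):
#   http://example.com/file.torrent|Some Name|734003200|12|3|http://example.com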
def bytesToHuman(filesize):
    """
    Convert float (size in bytes) to readable string
    """
    decimators = ('k','M','G','T')
    unit = ''
    for n in range(len(decimators)):
        if filesize > 1100.0:
            filesize /= 1024.0
            unit = decimators[n]
    return '%.1f%sB'%(filesize, unit)
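# For instance (illustrative value): bytesToHuman(123456789.0) returns '117.7MB'.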
def anySizeToBytes(size_string):
    """
    Convert a string like '1 KB' to '1024' (bytes)
    """
    # separate the number from the unit
    try:
        size, unit = size_string.split()
    except (ValueError, TypeError):
        try:
            size = size_string.strip()
            unit = ''.join([c for c in size if c.isalpha()])
            if unit:
                size = size[:-len(unit)]
        except (ValueError, TypeError):
            return -1
    try:
        size = float(size)
    except ValueError:
        # the number part could not be parsed (e.g. "N/A")
        return -1
    # scale according to the unit prefix, if there is one
    units_dict = { 'T': 40, 'G': 30, 'M': 20, 'K': 10 }
    if unit:
        short_unit = unit.upper()[0]
        if units_dict.has_key(short_unit):
            size = size * 2**units_dict[short_unit]
    return int(size)
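# For instance (illustrative values): anySizeToBytes('1 KB') returns 1024 and
# anySizeToBytes('700MB') returns 734003200; unparsable input returns -1.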
################################################################################
# Every engine should have a "search" method taking
# a space-free string as parameter (e.g. "family+guy")
# it should call prettyPrinter() with a dict as parameter
# see above for the dict keys
# As a convention, try to list results by decreasing number of seeds or similar
################################################################################
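# A minimal skeleton following this convention might look like the sketch
# below. The site, URL and parse_results() helper are hypothetical and shown
# for illustration only; a real engine must also be added to
# available_engines_list at the bottom of this file.
#
# class Example(object):
#     url = 'http://example.com'
#     def search(self, what):
#         dat = urllib.urlopen(self.url+'/search?q=%s'%what).read()
#         for link, name, size, seeds, leech in parse_results(dat):
#             prettyPrinter({'link': link, 'name': name,
#                            'size': anySizeToBytes(size),
#                            'seeds': int(seeds), 'leech': int(leech),
#                            'engine_url': self.url})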
class PirateBay(object):
    url = 'http://thepiratebay.org'

    def __init__(self):
        self.results = []
        self.parser = self.SimpleSGMLParser(self.results, self.url)

    class SimpleSGMLParser(sgmllib.SGMLParser):
        def __init__(self, results, url, *args):
            sgmllib.SGMLParser.__init__(self)
            self.td_counter = None
            self.current_item = None
            self.results = results
            self.url = url

        def start_a(self, attr):
            params = dict(attr)
            # anchors without an href (e.g. named anchors) would raise KeyError
            href = params.get('href', '')
            if href.startswith('/browse'):
                self.current_item = {}
                self.td_counter = 0
            elif href.startswith('http://torrents.thepiratebay.org/hashtorrent'):
                self.current_item['link'] = href.strip()
                self.td_counter = self.td_counter+1

        def handle_data(self, data):
            if self.td_counter == 1:
                if not self.current_item.has_key('name'):
                    self.current_item['name'] = ''
                self.current_item['name'] += data.strip()
            elif self.td_counter == 5:
                if not self.current_item.has_key('size'):
                    self.current_item['size'] = ''
                self.current_item['size'] += data.strip()
            elif self.td_counter == 6:
                if not self.current_item.has_key('seeds'):
                    self.current_item['seeds'] = ''
                self.current_item['seeds'] += data.strip()
            elif self.td_counter == 7:
                if not self.current_item.has_key('leech'):
                    self.current_item['leech'] = ''
                self.current_item['leech'] += data.strip()

        def start_td(self, attr):
            if isinstance(self.td_counter, int):
                self.td_counter += 1
                if self.td_counter > 7:
                    self.td_counter = None
                    # display the completed item
                    if self.current_item:
                        self.current_item['engine_url'] = self.url
                        self.current_item['size'] = anySizeToBytes(self.current_item['size'])
                        if not self.current_item['seeds'].isdigit():
                            self.current_item['seeds'] = 0
                        if not self.current_item['leech'].isdigit():
                            self.current_item['leech'] = 0
                        prettyPrinter(self.current_item)
                        self.results.append('a')

    def search(self, what):
        i = 0
        order = 'se'
        while True:
            results = []
            parser = self.SimpleSGMLParser(results, self.url)
            dat = urllib.urlopen(self.url+'/search.php?q=%s&orderby=%s&what=search&page=%u' % (what, order, i)).read()
            parser.feed(dat)
            parser.close()
            if len(results) <= 0:
                break
            i += 1
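# Note on the SGML parsers in this file (PirateBay above, Reactor and Isohunt
# below): sgmllib is event-driven, so each SimpleSGMLParser is a small state
# machine. start_a/start_tr detects the beginning of a result row and resets
# td_counter, start_td counts table columns, handle_data accumulates the text
# of the column currently pointed at, and once the counter passes the last
# relevant column the finished item is flushed to prettyPrinter().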
class Mininova(object):
    url = 'http://www.mininova.org'
    table_items = 'added cat name size seeds leech'.split()

    def search(self, what):
        order = 'seeds' # must be one of self.table_items

        def get_link(lnk):
            lnk = lnk.getElementsByTagName('a').item(0)
            return self.url+lnk.attributes.get('href').value

        def get_text(txt):
            if txt.nodeType == txt.TEXT_NODE:
                return txt.toxml()
            else:
                return ''.join([get_text(n) for n in txt.childNodes])

        dat = urllib.urlopen(self.url+'/search/%s/%s'%(what,order)).read().decode('utf-8', 'replace')
        x = minidom.parseString(dat.encode('utf-8', 'replace'))
        table = x.getElementsByTagName('table').item(0)
        if not table: return
        for tr in table.getElementsByTagName('tr'):
            tds = tr.getElementsByTagName('td')
            if tds:
                i = 0
                vals = {}
                for td in tds:
                    if self.table_items[i] == 'name':
                        vals['link'] = get_link(td).strip()
                    vals[self.table_items[i]] = get_text(td).strip()
                    i += 1
                vals['engine_url'] = self.url
                vals['size'] = anySizeToBytes(vals['size'])
                if not vals['seeds'].isdigit():
                    vals['seeds'] = 0
                if not vals['leech'].isdigit():
                    vals['leech'] = 0
                prettyPrinter(vals)
        # TODO: add multipage support
class BtJunkie(object):
    url = 'http://btjunkie.org'

    def search(self, what):
        dat = urllib.urlopen(self.url+'/search?q=%s'%what).read().decode('utf8', 'replace')
        # Not very readable, I know, but the SGML parser chokes on this page
        section_re = re.compile('(?s)href="/torrent\?do=download.*?<tr>')
        torrent_re = re.compile('(?s)href="(?P<link>.*?do=download[^"]+).*?'
                                '<b>(?P<name>.*?)</b>.*?'
                                '>(?P<size>\d+MB)</font>.*?'
                                '>(?P<seeds>\d+)</font>.*?'
                                '>(?P<leech>\d+)</font>')
        for match in section_re.finditer(dat):
            txt = match.group(0)
            m = torrent_re.search(txt)
            if m:
                torrent_infos = m.groupdict()
                torrent_infos['engine_url'] = self.url
                torrent_infos['size'] = anySizeToBytes(torrent_infos['size'])
                torrent_infos['link'] = self.url+torrent_infos['link']
                prettyPrinter(torrent_infos)
class MegaNova(object):
    url = 'http://www.meganova.org'

    def search(self, what):
        dat = urllib.urlopen(self.url+'/search.php?order=5&search=%s'%what).read().decode('utf8', 'replace')
        # Not very readable, I know, but the SGML parser chokes on this page
        section_re = re.compile('(?s)<td class="added".*?</tr')
        torrent_re = re.compile('(?s)href="(?P<link>/torrent/.*?)".*?'
                                '<span.*?>(?P<name>.*?)</span>.*?'
                                '>(?P<size>[0-9.]+\s+.B).*?'
                                '>(?P<seeds>\d+)<.*?'
                                '>(?P<leech>\d+)<')
        for match in section_re.finditer(dat):
            txt = match.group(0)
            m = torrent_re.search(txt)
            if m:
                torrent_infos = m.groupdict()
                torrent_infos['engine_url'] = self.url
                torrent_infos['size'] = anySizeToBytes(torrent_infos['size'])
                torrent_infos['link'] = self.url+torrent_infos['link']
                prettyPrinter(torrent_infos)
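# BtJunkie and MegaNova above share the same two-stage regex approach:
# section_re first slices the page into one chunk per torrent row, then
# torrent_re extracts the named groups (link, name, size, seeds, leech)
# from each chunk, which map directly onto the prettyPrinter() dict keys.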
class Reactor(object):
    url = 'http://tr.searching.com'

    class SimpleSGMLParser(sgmllib.SGMLParser):
        def __init__(self, results, url, *args):
            sgmllib.SGMLParser.__init__(self)
            self.td_counter = None
            self.current_item = None
            self.results = results
            self.id = None
            self.url = url

        def start_a(self, attr):
            params = dict(attr)
            href = params.get('href', '')
            if href.startswith('view.php'):
                self.current_item = {}
                self.td_counter = 0
                # extract the torrent id and save it on the instance:
                # it is needed later to build the download link
                equal = href.find("=")
                self.id = str(int(href[equal+1:]))

        def handle_data(self, data):
            if self.td_counter == 0:
                if not self.current_item.has_key('name'):
                    self.current_item['name'] = ''
                self.current_item['name'] += data.strip()
            elif self.td_counter == 1:
                if not self.current_item.has_key('size'):
                    self.current_item['size'] = ''
                self.current_item['size'] += data.strip()
            elif self.td_counter == 2:
                if not self.current_item.has_key('seeds'):
                    self.current_item['seeds'] = ''
                self.current_item['seeds'] += data.strip()
            elif self.td_counter == 3:
                if not self.current_item.has_key('leech'):
                    self.current_item['leech'] = ''
                self.current_item['leech'] += data.strip()

        def start_td(self, attr):
            if isinstance(self.td_counter, int):
                self.td_counter += 1
                if self.td_counter > 7:
                    self.td_counter = None
                    # add the item to the results
                    if self.current_item:
                        self.current_item['link'] = 'http://download.torrentreactor.net/download.php?name=%s&id=%s'%(cgi.escape(self.current_item['name']), self.id)
                        self.current_item['engine_url'] = self.url
                        self.current_item['size'] = anySizeToBytes(self.current_item['size'])
                        if not self.current_item['seeds'].isdigit():
                            self.current_item['seeds'] = 0
                        if not self.current_item['leech'].isdigit():
                            self.current_item['leech'] = 0
                        prettyPrinter(self.current_item)
                        self.results.append('a')

    def __init__(self):
        self.results = []
        self.parser = self.SimpleSGMLParser(self.results, self.url)

    def search(self, what):
        i = 0
        while True:
            results = []
            parser = self.SimpleSGMLParser(results, self.url)
            dat = urllib.urlopen(self.url+'/search.php?search=&words=%s&skip=%s'%(what, i*50)).read().decode('utf-8', 'replace')
            parser.feed(dat)
            parser.close()
            if len(results) <= 0:
                break
            i += 1
class Isohunt(object):
    url = 'http://isohunt.com'

    class SimpleSGMLParser(sgmllib.SGMLParser):
        def __init__(self, results, url, *args):
            sgmllib.SGMLParser.__init__(self)
            self.td_counter = None
            self.current_item = None
            self.results = results
            self.url = url

        def start_tr(self, attr):
            params = dict(attr)
            if 'onclick' in params:
                Durl = 'http://isohunt.com/dl.php?id='
                self.current_item = {}
                self.td_counter = 0
                begin_id = params['onclick'].find("id=")+3
                end_id = params['onclick'][begin_id:].find("'")
                self.current_item['link'] = '%s%s'%(Durl, params['onclick'][begin_id:begin_id+end_id])

        def handle_data(self, data):
            if self.td_counter == 3:
                if not self.current_item.has_key('name'):
                    self.current_item['name'] = ''
                self.current_item['name'] += data.strip()
            elif self.td_counter == 4:
                if not self.current_item.has_key('size'):
                    self.current_item['size'] = ''
                self.current_item['size'] += data.strip()
            elif self.td_counter == 5:
                if not self.current_item.has_key('seeds'):
                    self.current_item['seeds'] = ''
                self.current_item['seeds'] += data.strip()
            elif self.td_counter == 6:
                if not self.current_item.has_key('leech'):
                    self.current_item['leech'] = ''
                self.current_item['leech'] += data.strip()

        def start_td(self, attr):
            if isinstance(self.td_counter, int):
                self.td_counter += 1
                if self.td_counter > 7:
                    self.td_counter = None
                    # add the item to the results
                    if self.current_item:
                        self.current_item['engine_url'] = self.url
                        self.current_item['size'] = anySizeToBytes(self.current_item['size'])
                        if not self.current_item.has_key('seeds') or not self.current_item['seeds'].isdigit():
                            self.current_item['seeds'] = 0
                        if not self.current_item.has_key('leech') or not self.current_item['leech'].isdigit():
                            self.current_item['leech'] = 0
                        prettyPrinter(self.current_item)
                        self.results.append('a')

    def __init__(self):
        self.results = []
        self.parser = self.SimpleSGMLParser(self.results, self.url)

    def search(self, what):
        i = 1
        while True:
            results = []
            parser = self.SimpleSGMLParser(results, self.url)
            dat = urllib.urlopen(self.url+'/torrents.php?ihq=%s&ihp=%s'%(what,i)).read().decode('utf-8', 'replace')
            parser.feed(dat)
            parser.close()
            if len(results) <= 0:
                break
            i += 1
class EngineLauncher(threading.Thread):
    def __init__(self, engine, what):
        threading.Thread.__init__(self)
        self.engine = engine
        self.what = what

    def run(self):
        self.engine.search(self.what)
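# For instance, EngineLauncher(Mininova(), 'family+guy').start() runs a single
# Mininova search in its own thread; the main block below does exactly this
# for every selected engine when THREADED is set.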
if __name__ == '__main__':
    available_engines_list = BtJunkie, MegaNova, Mininova, PirateBay, Reactor, Isohunt
    if len(sys.argv) < 3:
        raise SystemExit('./nova.py <all|engine1[,engine2]*> <keywords>\navailable engines: %s'%
                         (','.join(e.__name__ for e in available_engines_list)))
    engines_list = [e.lower() for e in sys.argv[1].strip().split(',')]
    what = '+'.join(sys.argv[2:])
    if 'all' in engines_list:
        engines_list = [e.__name__.lower() for e in available_engines_list]
    selected_engines = set(e for e in available_engines_list if e.__name__.lower() in engines_list)
    for engine in selected_engines:
        try:
            if THREADED:
                EngineLauncher( engine(), what ).start()
            else:
                engine().search(what)
        except:
            if STANDALONE:
                traceback.print_exc()
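# Example invocations (illustrative keywords), matching the usage string above:
#   ./nova.py all family guy
#   ./nova.py mininova,piratebay family guy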