- Moved everything to trunk to create a stable branch

Christophe Dumez
2006-09-30 16:02:39 +00:00
commit 969a02b93e
200 changed files with 46382 additions and 0 deletions

src/search_engine/nova.py (new executable file, 432 lines)

@@ -0,0 +1,432 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Version: 1.7
# Changelog:
# - merged with qbittorrent branch (code cleanup, fixed indentation mistakes)
# - separate standalone and slave mode
# - added btjunkie
# - added meganova
# - added multithreaded mode
# Version: 1.6
# Changelog:
# - size is now always returned in bytes
# - seeders/leechers are now always returned as integers
# - cleaned up code
# - results are now displayed in real time
# - fixed piratebay, torrentreactor
# Author:
# Fabien Devaux <fab AT gnux DOT info>
# Contributors:
# Christophe Dumez <chris@qbittorrent.org> (qbittorrent integration)
# Thanks to gab #gcu @ irc.freenode.net (multipage support on PirateBay)
# Thanks to Elias <gekko04@users.sourceforge.net> (torrentreactor and isohunt search engines)
#
# Licence: BSD
import sys
import urllib
import sgmllib
from xml.dom import minidom
import re
import os
import cgi
import traceback
import threading
STANDALONE = True
THREADED = True
if os.environ.has_key('QBITTORRENT'):
    STANDALONE = False
    THREADED = False
def prettyPrinter(dictionnary):
    print "%(link)s|%(name)s|%(size)s|%(seeds)s|%(leech)s|%(engine_url)s"%dictionnary

if STANDALONE:
    def termPrettyPrinter(dictionnary):
        dictionnary['size'] = bytesToHuman(dictionnary['size'])
        print "%(seeds)5s/%(leech)5s | %(size)10s | %(name)s "%dictionnary
        print "wget '%s'"%dictionnary['link'].replace("'","\\'")
    globals()['prettyPrinter'] = termPrettyPrinter
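# For illustration, a slave-mode (qBittorrent) output line looks like this,
# one pipe-separated result per line (hypothetical values):
#   http://example.com/file.torrent|Some Name|734003200|12|3|http://example.com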
def bytesToHuman(filesize):
    """
    Convert float (size in bytes) to readable string
    """
    decimators = ('k','M','G','T')
    unit = ''
    for n in range(len(decimators)):
        if filesize > 1100.0:
            filesize /= 1024.0
            unit = decimators[n]
    return '%.1f%sB'%(filesize, unit)
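# For instance (illustrative value): bytesToHuman(123456789.0) returns '117.7MB'.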
def anySizeToBytes(size_string):
    """
    Convert a string like '1 KB' to '1024' (bytes)
    """
    # separate the number from the unit
    try:
        size, unit = size_string.split()
    except (ValueError, TypeError):
        try:
            size = size_string.strip()
            unit = ''.join([c for c in size if c.isalpha()])
            if unit:
                size = size[:-len(unit)]
        except (ValueError, TypeError):
            return -1
    try:
        size = float(size)
    except ValueError:
        # the number part could not be parsed (e.g. "N/A")
        return -1
    # scale according to the unit prefix, if there is one
    units_dict = { 'T': 40, 'G': 30, 'M': 20, 'K': 10 }
    if unit:
        short_unit = unit.upper()[0]
        if units_dict.has_key(short_unit):
            size = size * 2**units_dict[short_unit]
    return int(size)
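# For instance (illustrative values): anySizeToBytes('1 KB') returns 1024 and
# anySizeToBytes('700MB') returns 734003200; unparsable input returns -1.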
################################################################################
# Every engine should have a "search" method taking
# a space-free string as parameter (e.g. "family+guy")
# it should call prettyPrinter() with a dict as parameter
# see above for the dict keys
# As a convention, try to list results by decreasing number of seeds or similar
################################################################################
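# A minimal skeleton following this convention might look like the sketch
# below. The site, URL and parse_results() helper are hypothetical and shown
# for illustration only; a real engine must also be added to
# available_engines_list at the bottom of this file.
#
# class Example(object):
#     url = 'http://example.com'
#     def search(self, what):
#         dat = urllib.urlopen(self.url+'/search?q=%s'%what).read()
#         for link, name, size, seeds, leech in parse_results(dat):
#             prettyPrinter({'link': link, 'name': name,
#                            'size': anySizeToBytes(size),
#                            'seeds': int(seeds), 'leech': int(leech),
#                            'engine_url': self.url})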
class PirateBay(object):
    url = 'http://thepiratebay.org'

    def __init__(self):
        self.results = []
        self.parser = self.SimpleSGMLParser(self.results, self.url)

    class SimpleSGMLParser(sgmllib.SGMLParser):
        def __init__(self, results, url, *args):
            sgmllib.SGMLParser.__init__(self)
            self.td_counter = None
            self.current_item = None
            self.results = results
            self.url = url

        def start_a(self, attr):
            params = dict(attr)
            # anchors without an href (e.g. named anchors) would raise KeyError
            href = params.get('href', '')
            if href.startswith('/browse'):
                self.current_item = {}
                self.td_counter = 0
            elif href.startswith('http://torrents.thepiratebay.org/hashtorrent'):
                self.current_item['link'] = href.strip()
                self.td_counter = self.td_counter+1

        def handle_data(self, data):
            if self.td_counter == 1:
                if not self.current_item.has_key('name'):
                    self.current_item['name'] = ''
                self.current_item['name'] += data.strip()
            elif self.td_counter == 5:
                if not self.current_item.has_key('size'):
                    self.current_item['size'] = ''
                self.current_item['size'] += data.strip()
            elif self.td_counter == 6:
                if not self.current_item.has_key('seeds'):
                    self.current_item['seeds'] = ''
                self.current_item['seeds'] += data.strip()
            elif self.td_counter == 7:
                if not self.current_item.has_key('leech'):
                    self.current_item['leech'] = ''
                self.current_item['leech'] += data.strip()

        def start_td(self, attr):
            if isinstance(self.td_counter, int):
                self.td_counter += 1
                if self.td_counter > 7:
                    self.td_counter = None
                    # display the completed item
                    if self.current_item:
                        self.current_item['engine_url'] = self.url
                        self.current_item['size'] = anySizeToBytes(self.current_item['size'])
                        if not self.current_item['seeds'].isdigit():
                            self.current_item['seeds'] = 0
                        if not self.current_item['leech'].isdigit():
                            self.current_item['leech'] = 0
                        prettyPrinter(self.current_item)
                        self.results.append('a')

    def search(self, what):
        i = 0
        order = 'se'
        while True:
            results = []
            parser = self.SimpleSGMLParser(results, self.url)
            dat = urllib.urlopen(self.url+'/search.php?q=%s&orderby=%s&what=search&page=%u' % (what, order, i)).read()
            parser.feed(dat)
            parser.close()
            if len(results) <= 0:
                break
            i += 1
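# Note on the SGML parsers in this file (PirateBay above, Reactor and Isohunt
# below): sgmllib is event-driven, so each SimpleSGMLParser is a small state
# machine. start_a/start_tr detects the beginning of a result row and resets
# td_counter, start_td counts table columns, handle_data accumulates the text
# of the column currently pointed at, and once the counter passes the last
# relevant column the finished item is flushed to prettyPrinter().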
class Mininova(object):
    url = 'http://www.mininova.org'
    table_items = 'added cat name size seeds leech'.split()

    def search(self, what):
        order = 'seeds' # must be one of self.table_items

        def get_link(lnk):
            lnk = lnk.getElementsByTagName('a').item(0)
            return self.url+lnk.attributes.get('href').value

        def get_text(txt):
            if txt.nodeType == txt.TEXT_NODE:
                return txt.toxml()
            else:
                return ''.join([get_text(n) for n in txt.childNodes])

        dat = urllib.urlopen(self.url+'/search/%s/%s'%(what,order)).read().decode('utf-8', 'replace')
        x = minidom.parseString(dat.encode('utf-8', 'replace'))
        table = x.getElementsByTagName('table').item(0)
        if not table: return
        for tr in table.getElementsByTagName('tr'):
            tds = tr.getElementsByTagName('td')
            if tds:
                i = 0
                vals = {}
                for td in tds:
                    if self.table_items[i] == 'name':
                        vals['link'] = get_link(td).strip()
                    vals[self.table_items[i]] = get_text(td).strip()
                    i += 1
                vals['engine_url'] = self.url
                vals['size'] = anySizeToBytes(vals['size'])
                if not vals['seeds'].isdigit():
                    vals['seeds'] = 0
                if not vals['leech'].isdigit():
                    vals['leech'] = 0
                prettyPrinter(vals)
        # TODO: add multipage support
class BtJunkie(object):
    url = 'http://btjunkie.org'

    def search(self, what):
        dat = urllib.urlopen(self.url+'/search?q=%s'%what).read().decode('utf8', 'replace')
        # Not very readable, I know, but the SGML parser chokes on this page
        section_re = re.compile('(?s)href="/torrent\?do=download.*?<tr>')
        torrent_re = re.compile('(?s)href="(?P<link>.*?do=download[^"]+).*?'
                                '<b>(?P<name>.*?)</b>.*?'
                                '>(?P<size>\d+MB)</font>.*?'
                                '>(?P<seeds>\d+)</font>.*?'
                                '>(?P<leech>\d+)</font>')
        for match in section_re.finditer(dat):
            txt = match.group(0)
            m = torrent_re.search(txt)
            if m:
                torrent_infos = m.groupdict()
                torrent_infos['engine_url'] = self.url
                torrent_infos['size'] = anySizeToBytes(torrent_infos['size'])
                torrent_infos['link'] = self.url+torrent_infos['link']
                prettyPrinter(torrent_infos)
class MegaNova(object):
    url = 'http://www.meganova.org'

    def search(self, what):
        dat = urllib.urlopen(self.url+'/search.php?order=5&search=%s'%what).read().decode('utf8', 'replace')
        # Not very readable, I know, but the SGML parser chokes on this page
        section_re = re.compile('(?s)<td class="added".*?</tr')
        torrent_re = re.compile('(?s)href="(?P<link>/torrent/.*?)".*?'
                                '<span.*?>(?P<name>.*?)</span>.*?'
                                '>(?P<size>[0-9.]+\s+.B).*?'
                                '>(?P<seeds>\d+)<.*?'
                                '>(?P<leech>\d+)<')
        for match in section_re.finditer(dat):
            txt = match.group(0)
            m = torrent_re.search(txt)
            if m:
                torrent_infos = m.groupdict()
                torrent_infos['engine_url'] = self.url
                torrent_infos['size'] = anySizeToBytes(torrent_infos['size'])
                torrent_infos['link'] = self.url+torrent_infos['link']
                prettyPrinter(torrent_infos)
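# BtJunkie and MegaNova above share the same two-stage regex approach:
# section_re first slices the page into one chunk per torrent row, then
# torrent_re extracts the named groups (link, name, size, seeds, leech)
# from each chunk, which map directly onto the prettyPrinter() dict keys.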
class Reactor(object):
    url = 'http://tr.searching.com'

    class SimpleSGMLParser(sgmllib.SGMLParser):
        def __init__(self, results, url, *args):
            sgmllib.SGMLParser.__init__(self)
            self.td_counter = None
            self.current_item = None
            self.results = results
            self.id = None
            self.url = url

        def start_a(self, attr):
            params = dict(attr)
            href = params.get('href', '')
            if href.startswith('view.php'):
                self.current_item = {}
                self.td_counter = 0
                # extract the torrent id and save it on the instance:
                # it is needed later to build the download link
                equal = href.find("=")
                self.id = str(int(href[equal+1:]))

        def handle_data(self, data):
            if self.td_counter == 0:
                if not self.current_item.has_key('name'):
                    self.current_item['name'] = ''
                self.current_item['name'] += data.strip()
            elif self.td_counter == 1:
                if not self.current_item.has_key('size'):
                    self.current_item['size'] = ''
                self.current_item['size'] += data.strip()
            elif self.td_counter == 2:
                if not self.current_item.has_key('seeds'):
                    self.current_item['seeds'] = ''
                self.current_item['seeds'] += data.strip()
            elif self.td_counter == 3:
                if not self.current_item.has_key('leech'):
                    self.current_item['leech'] = ''
                self.current_item['leech'] += data.strip()

        def start_td(self, attr):
            if isinstance(self.td_counter, int):
                self.td_counter += 1
                if self.td_counter > 7:
                    self.td_counter = None
                    # add the item to the results
                    if self.current_item:
                        self.current_item['link'] = 'http://download.torrentreactor.net/download.php?name=%s&id=%s'%(cgi.escape(self.current_item['name']), self.id)
                        self.current_item['engine_url'] = self.url
                        self.current_item['size'] = anySizeToBytes(self.current_item['size'])
                        if not self.current_item['seeds'].isdigit():
                            self.current_item['seeds'] = 0
                        if not self.current_item['leech'].isdigit():
                            self.current_item['leech'] = 0
                        prettyPrinter(self.current_item)
                        self.results.append('a')

    def __init__(self):
        self.results = []
        self.parser = self.SimpleSGMLParser(self.results, self.url)

    def search(self, what):
        i = 0
        while True:
            results = []
            parser = self.SimpleSGMLParser(results, self.url)
            dat = urllib.urlopen(self.url+'/search.php?search=&words=%s&skip=%s'%(what, i*50)).read().decode('utf-8', 'replace')
            parser.feed(dat)
            parser.close()
            if len(results) <= 0:
                break
            i += 1
class Isohunt(object):
    url = 'http://isohunt.com'

    class SimpleSGMLParser(sgmllib.SGMLParser):
        def __init__(self, results, url, *args):
            sgmllib.SGMLParser.__init__(self)
            self.td_counter = None
            self.current_item = None
            self.results = results
            self.url = url

        def start_tr(self, attr):
            params = dict(attr)
            if 'onclick' in params:
                Durl = 'http://isohunt.com/dl.php?id='
                self.current_item = {}
                self.td_counter = 0
                begin_id = params['onclick'].find("id=")+3
                end_id = params['onclick'][begin_id:].find("'")
                self.current_item['link'] = '%s%s'%(Durl, params['onclick'][begin_id:begin_id+end_id])

        def handle_data(self, data):
            if self.td_counter == 3:
                if not self.current_item.has_key('name'):
                    self.current_item['name'] = ''
                self.current_item['name'] += data.strip()
            elif self.td_counter == 4:
                if not self.current_item.has_key('size'):
                    self.current_item['size'] = ''
                self.current_item['size'] += data.strip()
            elif self.td_counter == 5:
                if not self.current_item.has_key('seeds'):
                    self.current_item['seeds'] = ''
                self.current_item['seeds'] += data.strip()
            elif self.td_counter == 6:
                if not self.current_item.has_key('leech'):
                    self.current_item['leech'] = ''
                self.current_item['leech'] += data.strip()

        def start_td(self, attr):
            if isinstance(self.td_counter, int):
                self.td_counter += 1
                if self.td_counter > 7:
                    self.td_counter = None
                    # add the item to the results
                    if self.current_item:
                        self.current_item['engine_url'] = self.url
                        self.current_item['size'] = anySizeToBytes(self.current_item['size'])
                        if not self.current_item.has_key('seeds') or not self.current_item['seeds'].isdigit():
                            self.current_item['seeds'] = 0
                        if not self.current_item.has_key('leech') or not self.current_item['leech'].isdigit():
                            self.current_item['leech'] = 0
                        prettyPrinter(self.current_item)
                        self.results.append('a')

    def __init__(self):
        self.results = []
        self.parser = self.SimpleSGMLParser(self.results, self.url)

    def search(self, what):
        i = 1
        while True:
            results = []
            parser = self.SimpleSGMLParser(results, self.url)
            dat = urllib.urlopen(self.url+'/torrents.php?ihq=%s&ihp=%s'%(what,i)).read().decode('utf-8', 'replace')
            parser.feed(dat)
            parser.close()
            if len(results) <= 0:
                break
            i += 1
class EngineLauncher(threading.Thread):
    def __init__(self, engine, what):
        threading.Thread.__init__(self)
        self.engine = engine
        self.what = what

    def run(self):
        self.engine.search(self.what)
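# For instance, EngineLauncher(Mininova(), 'family+guy').start() runs a single
# Mininova search in its own thread; the main block below does exactly this
# for every selected engine when THREADED is set.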
if __name__ == '__main__':
    available_engines_list = BtJunkie, MegaNova, Mininova, PirateBay, Reactor, Isohunt
    if len(sys.argv) < 3:
        raise SystemExit('./nova.py <all|engine1[,engine2]*> <keywords>\navailable engines: %s'%
                         (','.join(e.__name__ for e in available_engines_list)))
    engines_list = [e.lower() for e in sys.argv[1].strip().split(',')]
    what = '+'.join(sys.argv[2:])
    if 'all' in engines_list:
        engines_list = [e.__name__.lower() for e in available_engines_list]
    selected_engines = set(e for e in available_engines_list if e.__name__.lower() in engines_list)
    for engine in selected_engines:
        try:
            if THREADED:
                EngineLauncher( engine(), what ).start()
            else:
                engine().search(what)
        except:
            if STANDALONE:
                traceback.print_exc()
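# Example invocations (illustrative keywords), matching the usage string above:
#   ./nova.py all family guy
#   ./nova.py mininova,piratebay family guy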