"""Scrape vuze.com listing pages and download the .torrent files they link to.

Usage:
    <script>               crawl the listing pages and download each torrent
    <script> -f FILE ...   (or --debug) do not touch the network; parse the
                           given, previously saved listing pages and print
                           the torrents found in them
"""
import getopt
import re
import string
import sys
import time

try:  # Python 2 module names
    import cookielib
    import urllib2
except ImportError:  # Python 3: same APIs under their new names
    import http.cookiejar as cookielib
    import urllib.request as urllib2

# URL for all pages, ordered by number of downloads.
# Most downloaded .torrent files first.
# "$PAGENUM" is a string.Template placeholder filled in per page.
BASE_URL = 'http://www.vuze.com/content/BucketBrowse.htm?sp=X&sp=X&sp=X&sp=$PAGENUM&sp=SPOPULAR&sp=SALL&sp=X&sp=X&sp=X&sp=X&sp=X'
# Alternative listing URL (SAZHOT ordering instead of SPOPULAR):
# BASE_URL = 'http://www.vuze.com/content/BucketBrowse.htm?sp=X&sp=X&sp=X&sp=$PAGENUM&sp=SAZHOT&sp=SALL&sp=X&sp=X&sp=X&sp=X&sp=X'

# Crawl by default; main() clears this when -f/--debug is given.
CRAWL = True

# Listing pages visited by a crawl (inclusive) and the pause before each
# torrent download. Values are the ones the script has always used.
_FIRST_PAGE = 6
_LAST_PAGE = 54
_DOWNLOAD_DELAY_SECONDS = 10

# Compiled once instead of being rebuilt for every line of every page.
_TITLE_RE = re.compile(r'\); return false;" title="([^"]+)" href="/details')
# Example link : 'span id="java_not_installed_QSHDTXGQUL53HVQ2ZJGFYYLTQXUKJ5TT" style'
# translated   : http://www.vuze.com/download/QSHDTXGQUL53HVQ2ZJGFYYLTQXUKJ5TT.torrent
# The explicit character class keeps matching identical on Python 2 and 3
# (on Python 3, \w would also accept non-ASCII word characters).
_CODE_RE = re.compile(
    r'span id="java_not_installed_([A-Za-z0-9_]{20,})" style="display: none'
)


def parse_page(page):
    """Extract torrent download URLs from the HTML of one listing page.

    The page is scanned line by line. A title match is remembered and
    attached to the next download code that is found; a code seen before
    any title is stored under the key 'unknown'.

    Args:
        page: HTML of the page, as text or as raw bytes.

    Returns:
        dict mapping title (with every '/' replaced by '_') to the
        download URL. A later entry with the same title replaces an
        earlier one.
    """
    # On Python 3 the network/file layer hands us bytes; the patterns are
    # text patterns. (On Python 2 bytes is str, so this branch is skipped.)
    if isinstance(page, bytes) and not isinstance(page, str):
        page = page.decode('utf-8', 'replace')

    torrents = {}
    prev_title = 'unknown'
    for line in page.split("\n"):
        title = _TITLE_RE.search(line)
        if title:
            # The title is later used as a file name, so '/' must go.
            prev_title = title.group(1).replace('/', '_')
        code = _CODE_RE.search(line)
        if code:
            torrents[prev_title] = (
                'http://www.vuze.com/download/' + code.group(1) + '.torrent'
            )
    return torrents


def get_page(url):
    """Fetch ``url`` with a cookie-aware opener and return the raw body."""
    cookie_jar = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
    response = opener.open(url)
    try:
        return response.read()
    finally:
        # Release the connection even if read() fails.
        response.close()


def get_url_and_write_to_file(url, name):
    """Download ``url``, save the body to the file ``name`` and return it.

    The file is written in binary mode: the payload may be a .torrent
    file, which text mode can corrupt through newline translation.
    """
    page = get_page(url)
    with open(name, 'wb') as pagefile:
        pagefile.write(page)
    return page


def get_all_on_vuze(baseurl):
    """Crawl the listing pages and download every torrent found on them.

    Each listing page is saved as ``vuze_page_<n>.html`` and each torrent
    as ``<title>.torrent`` in the current directory.

    Args:
        baseurl: listing URL containing a ``$PAGENUM`` placeholder.
    """
    template = string.Template(baseurl)
    for num in range(_FIRST_PAGE, _LAST_PAGE + 1):
        print("Downloading page " + str(num))
        url = template.safe_substitute(dict(PAGENUM=str(num)))
        page = get_url_and_write_to_file(url, "vuze_page_" + str(num) + ".html")
        torrents = parse_page(page)
        for title, torrent_url in torrents.items():
            print("  %s %s" % (title, torrent_url))
            # Pause before every download.
            time.sleep(_DOWNLOAD_DELAY_SECONDS)
            get_url_and_write_to_file(torrent_url, title + ".torrent")


def main(argv=None):
    """Parse the command line and run in crawl or no-crawl mode.

    Args:
        argv: argument list without the program name; defaults to
            ``sys.argv[1:]``.

    Returns:
        Process exit status: 0 on success, 2 on an unknown option.
    """
    global CRAWL
    if argv is None:
        argv = sys.argv[1:]

    try:
        opts, args = getopt.getopt(argv, "f", ["debug"])
    except getopt.GetoptError:
        # print help information and exit:
        print("Unknown option")
        return 2

    CRAWL = not any(opt in ("-f", "--debug") for opt, _ in opts)

    if CRAWL:
        print("Crawl mode..")
        get_all_on_vuze(BASE_URL)
    else:
        print("No crawl mode..")
        # parse_page() needs page content; take it from the saved pages
        # named on the command line instead of calling it with no argument.
        for path in args:
            with open(path, 'rb') as pagefile:
                torrents = parse_page(pagefile.read())
            for title, torrent_url in torrents.items():
                print("  %s %s" % (title, torrent_url))
    return 0


if __name__ == "__main__":
    sys.exit(main())