# written by Raynor Vliegendhart
# see LICENSE.txt for license information

import os
import sys
import codecs
from optparse import OptionParser

from Tribler.Core.CacheDB.sqlitecachedb import SQLiteCacheDB


def _write_frequency_file(path, rows):
    """Write (text, freq) rows to `path` as UTF-8, one "freq text" line per row.

    The file is closed even if a write fails part-way through.
    """
    with codecs.open(path, 'w', 'utf8') as fh:
        for text, freq in rows:
            # Same output as the former ``print >>fh, freq, text``:
            # the two values separated by one space, newline-terminated.
            fh.write(u'%s %s\n' % (freq, text))


def main():
    """Dump term and bi-term phrase frequencies from a database to text files.

    Usage: ``prog [-o DIR] SDB``

    Reads the ``TermFrequency`` and ``TorrentBiTermPhrase`` tables of the
    database file SDB and writes two files into the output directory
    (default: the current working directory):

    * ``nbuzz_uni.txt`` -- terms with freq > 1, highest frequency first.
    * ``nbuzz_bi.txt``  -- two-term phrases with their occurrence count,
      highest count first.

    Exits with status 4 when SDB does not exist. When the number of
    positional arguments is not exactly one, prints the help text and exits
    (status 0).
    """
    parser = OptionParser(usage='usage: %prog [OPTION]... SDB')
    parser.add_option('-o', dest='output', metavar='DIR', default=os.getcwd(),
                      help='outputdir, default: cwd')

    options, args = parser.parse_args()
    if len(args) != 1:
        parser.print_help()
        # sys.exit rather than the site-provided exit(), which is not
        # guaranteed to exist (e.g. ``python -S``).
        sys.exit()

    filename = args[0]
    outputdir = options.output

    if not os.path.exists(filename):
        print("*** Cannot find '%s'" % filename)
        sys.exit(4)

    if not os.path.exists(outputdir):
        # makedirs so that a nested, not-yet-existing DIR also works.
        os.makedirs(outputdir)

    # Open the database named by the parsed positional argument.
    # sys.argv[1] is wrong whenever an option precedes it ("-o DIR SDB").
    sqlitedb = SQLiteCacheDB.getInstance(None)
    sqlitedb.initDB(filename)

    # The separator is written as ' ' (a standard SQL string literal);
    # double-quoted string literals are only a legacy SQLite quirk.
    sql = """
    SELECT TF1.term || ' ' || TF2.term AS phrase, COUNT(*) AS freq
    FROM TorrentBiTermPhrase P, TermFrequency TF1, TermFrequency TF2
    WHERE P.term1_id = TF1.term_id AND P.term2_id = TF2.term_id
    GROUP BY term1_id, term2_id
    ORDER by freq DESC;
    """
    bigrams = sqlitedb.fetchall(sql)

    sql = """SELECT term, freq FROM TermFrequency WHERE freq > 1 ORDER by freq DESC;"""
    unigrams = sqlitedb.fetchall(sql)

    _write_frequency_file(os.path.join(outputdir, 'nbuzz_uni.txt'), unigrams)
    _write_frequency_file(os.path.join(outputdir, 'nbuzz_bi.txt'), bigrams)


if __name__ == '__main__':
    main()