################################################################################
#
# Bayesian Filter - 3.23.09 - Joe Mariglio
#
#===============================================================================
#
# classify text documents. it's slow but it works.
#
# supply as many docs as you want.
#
################################################################################

import sys
import string
import os.path
import array
import math


class Category:
    """Word-frequency model for one category (one training document).

    Attributes:
        count: dict mapping each trained word to its number of occurrences.
        total: total number of words trained (sum of all counts).
        name: label of the category; ``Classifier.addCat`` sets it to the
            training file's name.
    """

    def __init__(self):
        # Per-instance state.  These used to be class attributes, which made
        # every Category share (and mutate) one and the same ``count`` dict.
        self.count = {}
        self.total = 0
        self.name = ""

    def train(self, word):
        """Record one occurrence of ``word`` in this category."""
        self.count[word] = self.count.get(word, 0) + 1
        self.total += 1

    def percentage(self, word):
        """Return the relative frequency of ``word`` in this category.

        Words never seen in training get a small non-zero value
        (``0.01 / total``) so that ``math.log`` in ``score`` never receives
        zero.  An untrained category (``total == 0``) returns 0.0.
        """
        if self.total == 0:
            return 0.0
        if word in self.count:
            return self.count[word] / float(self.total)
        return 0.01 / self.total

    def score(self, src, totalAll):
        """Return the log-score of the words in ``src`` for this category.

        The score is the sum of ``log(percentage(word))`` over ``src`` plus
        the log prior ``log(total / totalAll)``.

        Args:
            src: iterable of words to score.
            totalAll: total number of trained words across all categories.

        Returns:
            The log-score as a float; ``-inf`` for a category that has no
            training data (its prior is zero).
        """
        if self.total == 0:
            return float("-inf")
        score_val = 0
        for word in src:
            score_val += math.log(self.percentage(word))
        score_val += math.log(self.total / float(totalAll))
        return score_val

    def relevance(self, word, categories):
        """Return this category's share of ``word``'s frequency.

        Computed as ``self.percentage(word)`` divided by the sum of
        ``percentage(word)`` over ``categories``.  Returns 0.0 when that sum
        is zero (no categories, or none of them trained).
        """
        per_sum = 0
        for cat in categories:
            per_sum += cat.percentage(word)
        if per_sum == 0:
            return 0.0
        return self.percentage(word) / per_sum


class Classifier:
    """Holds the set of trained categories.

    Attributes:
        cats: list of ``Category`` objects, in the order they were added.
        uniq: list of words handed to ``Category.score`` by ``main``.
    """

    def __init__(self):
        # Per-instance lists; as class attributes they were shared between
        # every Classifier.
        self.cats = []
        # NOTE(review): nothing in this file ever fills ``uniq`` (the code
        # that did so was commented out), so ``main`` scores an empty word
        # list and the printed score is just the log prior.
        self.uniq = []

    def addCat(self, fname):
        """Train a new category from the text file ``fname`` and store it.

        Every whitespace-separated token in the file is trained as one word.
        The category's name is ``fname``.
        """
        cat = Category()
        cat.name = fname
        # ``with`` guarantees the file is closed even if reading fails.
        with open(fname, "r") as infile:
            for line in infile:
                # split() with no argument drops the trailing newline and
                # empty tokens, which split(" ") counted as (parts of) words.
                for word in line.split():
                    cat.train(word)
        self.cats.append(cat)


def main(argv):
    """Train one category per file named in ``argv`` and print each score.

    Args:
        argv: list of training-file paths (the script name excluded).

    For every category one line ``<name>: <score>`` is printed.
    """
    c = Classifier()
    for name in argv:
        c.addCat(name)
    cat_w_t = sum(cat.total for cat in c.cats)
    for cat in c.cats:
        score = cat.score(c.uniq, cat_w_t)
        print(cat.name + ": " + str(score))


if __name__ == '__main__':
    import sys, string, os.path, random, array
    main(sys.argv[1:])