# -*- coding: UTF-8 -*- import re import os import string import math import random import time from postlistserror import * from stopwords import * from webextract import * #Parameters numdocuments = 100000 numbertoremove = 0#30 algorithmstoinclude = ['nb']#['maxlike','neighbor'] #maxlike - Maximum Likelyhood #neighbor - Nearest Neighbor #nb - naive bayes def sortfreq(x,y): return cmp(y[1],x[1]) def getPriorityList(page,domain,type): c_wordfrequency = {} for word in getAllData(open(page).read(),domain,type): c_wordfrequency[word] = c_wordfrequency.get(word,0)+1 #c_wordfrequency = removemunicipalitywords(removestopwords(removenonascii(c_wordfrequency))).items() c_wordfrequency = removestopwords(removenonascii(c_wordfrequency)).items() c_wordfrequency.sort(sortfreq) c_wordfrequency = [i[0] for i in c_wordfrequency][:50] return c_wordfrequency p_dates = [] n_dates = [] p_uniquedates = [] n_uniquedates = [] p_numbers = [] n_numbers = [] p_months = [] n_months = [] p_days = [] n_days = [] p_word = [] n_word = [] p_linktext = [] n_linktext = [] p_wordfrequency = {} n_wordfrequency = {} p_wordfrequency2 = {} n_wordfrequency2 = {} sites = [] n_neighbor = [] p_neighbor = [] n_nb = [] p_nb = [] if __name__=='__main__': for file in os.listdir('./postlists/'): if os.path.isfile(os.path.join('./postlists/'+file)) and os.path.isfile(os.path.join('./notpostlists/'+file)): #Site sites.append(file) if 'nb' in algorithmstoinclude: p_nb += getAllData(open(os.path.join('./postlists/'+file)).read(),file,'p') n_nb += getAllData(open(os.path.join('./notpostlists/'+file)).read(),file,'n') if 'maxlike' in algorithmstoinclude: #Dates p_dates.append(len(getAllProperdates(open(os.path.join('./postlists/'+file)).read()))) n_dates.append(len(getAllProperdates(open(os.path.join('./notpostlists/'+file)).read()))) #Unique dates p_uniquedates.append(len(getUniqueDates(open(os.path.join('./postlists/'+file)).read()))) n_uniquedates.append(len(getUniqueDates(open(os.path.join('./notpostlists/'+file)).read()))) #Month Numbers p_numbers.append(len(getAllMonths(open(os.path.join('./postlists/'+file)).read()))) n_numbers.append(len(getAllMonths(open(os.path.join('./notpostlists/'+file)).read()))) #Moth Names p_months.append(len(getMonthNames(open(os.path.join('./postlists/'+file)).read()))) n_months.append(len(getAllMonths(open(os.path.join('./notpostlists/'+file)).read()))) #Common words p_word.append(len(getCommonWords(open(os.path.join('./postlists/'+file)).read()))) n_word.append(len(getCommonWords(open(os.path.join('./notpostlists/'+file)).read()))) #Week days p_days.append(len(getWeekdays(open(os.path.join('./postlists/'+file)).read()))) n_days.append(len(getWeekdays(open(os.path.join('./notpostlists/'+file)).read()))) #Text links p_linktext.append(len(getTextLinks(open(os.path.join('./postlists/'+file)).read()))) n_linktext.append(len(getTextLinks(open(os.path.join('./notpostlists/'+file)).read()))) if 'neighbor' in algorithmstoinclude: p_thisfreq = getPriorityList(os.path.join('./postlists/'+file)) if p_thisfreq: p_neighbor.append(p_thisfreq) n_thisfreq = getPriorityList(os.path.join('./notpostlists/'+file)) if n_thisfreq: n_neighbor.append(n_thisfreq) p_thisfreq = {} if 'maxlike' in algorithmstoinclude: for word in getAllData(open(os.path.join('./postlists/'+file),file,'p').read()): p_wordfrequency[word] = p_wordfrequency.get(word,0)+1 #if 'neighbor' in algorithmstoinclude: # p_thisfreq[word] = p_thisfreq.get(word,0)+1 if p_thisfreq: p_neighbor.append(p_thisfreq) n_thisfreq = {} if 'maxlike' in algorithmstoinclude: for word in getAllData(open(os.path.join('./notpostlists/'+file),file,'n').read()): n_wordfrequency[word] = n_wordfrequency.get(word,0)+1 #if 'neighbor' in algorithmstoinclude: # n_thisfreq[word] = n_thisfreq.get(word,0)+1 if n_thisfreq: n_neighbor.append(n_thisfreq) if 'nb' in algorithmstoinclude: #Removing the word postlist and postjournal and innsyn from training data. for unwanted in ('classheadlinelink','kommune','kommunens','2009','idwebparttitlebannermenupartuten','forsiden','span','3span','a','href','navn'): while unwanted in p_nb: p_nb.remove(unwanted) while unwanted in n_nb: n_nb.remove(unwanted) #if len(p_nb)prob_n: return 'p',1-(prob_p/(prob_p+prob_n)),res_p,time.time()-pre#(math.e**prob_p)/(math.e**prob_n+math.e**prob_p) else: return 'n',1-(prob_n/(prob_p+prob_n)),res_n,time.time()-pre#(math.e**prob_n)/(math.e**prob_n+math.e**prob_p) def getNearest(testvector,this_p_neighbor,this_n_neighbor,number=25,ignoreclosest=False): num_n = 0 num_p = 0 real_p = [] real_n = [] #print 'Starting real_p' for p_freq in this_p_neighbor: real_p.append(getDistance(testvector,p_freq)) #print real_p[-1], #print real_p#testvector,real_p #print 'Ending real_p' for n_freq in this_n_neighbor: real_n.append(getDistance(testvector,n_freq)) real_pn = [(i,'p') for i in real_p]+[(i,'n') for i in real_n] del real_p[:] del real_n[:] real_pn.sort(sortn) if ignoreclosest: real_pn=real_pn[1:number+1] else: real_pn=real_pn[:number] num_n = len([i for i in real_pn if i[1]=='n']) num_p = len([i for i in real_pn if i[1]=='p']) if num_n>num_p: return real_pn,'n' else: return real_pn,'p' if 'maxlike' in algorithmstoinclude: n_wordfrequency = [i[0] for i in n_wordfrequency][:50] p_wordfrequency = [i[0] for i in p_wordfrequency][:50] #Actual classification if __name__=='__main__': print 'Type Site Mail_Record Not_Mail_Record Correct' ntrue_max = 0 nfalse_max = 0 ntrue_nei = 0 nfalse_nei = 0 ntrue_nb = 0 nfalse_nb = 0 if __name__=='__main__': for file in os.listdir('./postlists/')[:numdocuments]: if os.path.isfile(os.path.join('./postlists/'+file)) and os.path.isfile(os.path.join('./notpostlists/'+file)): #Postlists c_wordfrequency = getPriorityList('./postlists/'+file,file,'p') if c_wordfrequency: print 'Correct:p',file, if 'maxlike' in algorithmstoinclude: p = getDistance(c_wordfrequency,p_wordfrequency) n = getDistance(c_wordfrequency,n_wordfrequency) print p,n,pn, if 'neighbor' in algorithmstoinclude: distance,neighbor = getNearest(c_wordfrequency,p_neighbor,n_neighbor,ignoreclosest=True) print neighbor,neighbor=='n',#distance, if 'nb' in algorithmstoinclude: result,prob,res,thistime = getNB(getAllData(open(os.path.join('./notpostlists/'+file)).read(),file,'n'),p_nb,n_nb,ignoreclosest=True,correct='n',testvectoropposite=getAllData(open(os.path.join('./postlists/'+file)).read(),file,'p'),numbertoremove=numbertoremove) print 'Classified:',result,prob,result=='n','.'+' '.join(res)+'.', f = open('duration_onlytitle','a') f.write(str(thistime)+'\n') f.close() for word in res: n_wordfrequency2[word.strip()] = n_wordfrequency2.get(word.strip(),0)+1 print if 'maxlike' in algorithmstoinclude: if p>n: ntrue_max += 1 else: nfalse_max += 1 if 'neighbor' in algorithmstoinclude: if neighbor=='n': ntrue_nei +=1 else: nfalse_nei +=1 if 'nb' in algorithmstoinclude: if result=='n': ntrue_nb +=1 else: nfalse_nb +=1 if 'nb' in algorithmstoinclude: p_wordfrequency2 = p_wordfrequency2.items() n_wordfrequency2 = n_wordfrequency2.items() p_wordfrequency2.sort(sortfreq) n_wordfrequency2.sort(sortfreq) allwords = float(sum([i[1] for i in p_wordfrequency2])) f = open('nb_wordmailrecord','w') f.write(' word freq norm\n') i = 1 for word,freq in p_wordfrequency2[:10]: f.write(str(i) + ' ' +word + ' ' +str(freq) +' '+str(freq/allwords)+'\n') i += 1 f.close() allwords = float(sum([i[1] for i in p_wordfrequency2])) f = open('nb_wordnomailrecord','w') f.write(' word freq norm\n') i = 1 for word,freq in n_wordfrequency2[:10]: f.write(str(i) + ' ' +word + ' ' +str(freq) +' '+str(freq/allwords)+'\n') i += 1 f.close() if __name__=='__main__': if 'maxlike' in algorithmstoinclude: print 'Maximum likelyhood' print ' correct:',ntrue_max,float(ntrue_max)/(ntrue_max+nfalse_max) print ' incorrect:',nfalse_max,float(nfalse_max)/(ntrue_max+nfalse_max) if 'neighbor' in algorithmstoinclude: print 'Nearest Neighbor' print ' correct:',ntrue_nei,float(ntrue_nei)/(ntrue_nei+nfalse_nei) print ' incorrect:',nfalse_nei,float(nfalse_nei)/(ntrue_nei+nfalse_nei) if 'nb' in algorithmstoinclude: print 'Naïve Bayes' print ' correct:',ntrue_nb,float(ntrue_nb)/(ntrue_nb+nfalse_nb) print ' incorrect:',nfalse_nb,float(nfalse_nb)/(ntrue_nb+nfalse_nb) print ' number of training data',ntrue_nb+nfalse_nb-numbertoremove*2-1