"""Module for performing adaptive sampling $Id$ """ # Copyright 2008-2011 eGovMon # This program is distributed under the terms of the GNU General # Public License. # # This file is part of the eGovernment Monitoring # (eGovMon) # # eGovMon is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # eGovMon is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with eGovMon; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, # MA 02110-1301 USA __author__ = "$Author$" __version__ = "$Revision$" __updated__ = "$LastChangedDate$" from math import * #from pygsl.statistics import * import RDF import time import pdb if not __name__ == 'AdaptiveSampling.adaptivesampling': #Hack for making documentation generation possible from re import compile,DOTALL import sc from samplingerror import * global samplenum samplenum = 0 global sitedictionary sitedictionary = {} global siteresultdict siteresultdict = {} global siteerrorcount siteerrorcount = 0 global sitepasscount sitepasscount = 0 #Precompiled regular expressions if not __name__ == 'AdaptiveSampling.adaptivesampling': rdfid = compile('rdf:ID=\".+?\"') earlresult = compile(r'earl:result rdf:ID.+?strawman\#....',DOTALL) def isFloat(value): try: float(value) return True except ValueError: return False def translateDict(value): """Translates the WAMIDs to URLs to make quicker lookups possible Keyword arguments: value -- WAM ID not as URL returns WAM ID as URL Examples: >>> translateDict('EIAO:B10.11.01.001.001-001') 'http://www.eiao.net/rdf/1.0/#www.eiao.net.1.0.RelaxedWAM.EIAO.B10.11.01.001.001-001' >>> translateDict('Imergo:B10.10.01.001.001-001.001') 'http://www.eiao.net/rdf/1.0/#www.fit.fraunhofer.de.ergebnisse.imergo.Imergo.B10.10.01.001.001-001.001' """ eiaourl = 'http://www.eiao.net/rdf/1.0/#www.eiao.net.1.0.RelaxedWAM.EIAO.' imergourl = 'http://www.eiao.net/rdf/1.0/#www.fit.fraunhofer.de.ergebnisse.imergo.Imergo.' return value.replace('EIAO:',eiaourl).replace('Imergo:',imergourl) def removeTail(value): """Removes the tail of a string to retrieve the actual ID keyword arguments: value -- WAM ID with tail returns WAM ID without tail Example: >>> removeTail('http://www.eiao.net/rdf/1.0/#www.eiao.net.1.0.RelaxedWAM.EIAO.B10.09.01.001.001-001.001-V402382') 'http://www.eiao.net/rdf/1.0/#www.eiao.net.1.0.RelaxedWAM.EIAO.B10.09.01.001.001-001' """ return value[:value.rfind('.')] #Should this really be commented out? #if not __name__ == 'AdaptiveSampling.adaptivesampling': # global cwamvalues # sc = sc.SystemConfiguration() # cwamvalues = dict([(translateDict(a[0]),float(a[1])) for a in [line.split('\t') for line in open(sc.barriercomputationfile).readlines()] if isFloat(a[1])]) def dynamic(reacherrmarg,site,earlsamples,egovmondb,minscenario,writeresult=True): """Perform dynamic sampling Keyword arguments: reacherrmarg -- Error margin to reach site -- The site this sample is related to earlsample -- Complete EARL for this one sample writeresult -- If the aggregated results should be counted as being part of the site. """ global cwamvalues global sitedictionary if not sitedictionary.has_key(site): sitedictionary[site] = [] earlsample = ''.join(earlsamples) if writeresult: sitedictionary[site] = sitedictionary[site]+[getEARLCWAM2(earlsample)] errmargin = 1 num = 0 avg,stddev,errmargin = getAvgCWAM(site) else: errmargin = 1 num = 0 avg,stddev,errmargin = getAvgCWAM(site,sitedictionary[site]+[getEARLCWAM2(earlsample)],egovmondb.getsamplecount()) if egovmondb.getsamplecount()>=int(sc.SystemConfiguration().minsamplecount): return True,avg,stddev,errmargin else: return False,avg,stddev,errmargin #End of FIXME if errmargin>reacherrmarg or egovmondb.getsamplecount()<=minscenario: return False,avg,stddev,errmargin else: return True,avg,stddev,errmargin def getAvgCWAM(site,cwams=[],samplecount=0): """Returns CWAM value for samples Keyword argurments: samples -- All samples to be aggregated returns average, standard deviation and error margin """ #Note: This is called directly! #Used for getting the final site result in the end #Returns average from brps algorithm, and sd/err-marg from average over pages #average = 0 #standarddev = 0 #errmarg = 0 #global samplenum #samplenum = 0 #try: # if not cwams: # cwams = [sample for sample in sitedictionary[site] if sample] #except KeyError: # raise NoSiteInformationError(site) #average = mean(cwams) global sitepasscount global siteerrorcount if not range(siteerrorcount)+range(sitepasscount): raise NoSiteInformationError(site) errorlist = [1 for i in range(siteerrorcount)] passlist = [0 for i in range(sitepasscount)] average = getBarrierRatioPerSite(sitepasscount,siteerrorcount) #stddev = sd_m(cwams,average) #stddev = sd_m(errorlist+passlist,average) stddev = 0 errmarg = 0 #if cwams: # #standarddev = variance(cwams)#sd_m(cwams,average) # #errmarg = 1.96*stddev/(sqrt(len(cwams))) # #errmarg = 2.326*stddev/(sqrt(len(cwams))) # if sitepasscount+siteerrorcount>0: # errmarg = 0 # #errmarg = ci*stddev/(sqrt(sitepasscount+siteerrorcount)) # #errmarg = 2.326*stddev/(sqrt(sitepasscount+siteerrorcount)) # else: # errmarg = 0 #else: # standarddev = 0 # errmarg = 0 return average,stddev,errmarg def getEARLCWAM2(EARL, scorealgorithm="brps"): """Getting CWAM Value of an EARL Sample without usering RedLand Keyword arguments: EARL -- Valid EARL as Strign scorealgorithm -- the algorithm for calculating score: "uwem", "brpp", "brps", "brdpp_s", "brdps_s", "brdpp_p", "brdps_p" Returns CWAM value of the EARL sample Example: >>> earl = '' >>> print getEARLCWAM2(earl) 0.5 """ #Performing adaptive sampling using simple regular expressions instead of redland for parsing, made the changed the speed to 1.39s from 106.6s. In other words almost 100 times performence increase. Because of this getEARLCWAM2 is preferable to use instead of getEARLCWAM. pre = time.time() if scorealgorithm=="uwem": sample = list(set([''.join(['http://www.eiao.net/rdf/1.0/#',removeTail(rdfid.findall(s)[0][8:-1])]) for s in earlresult.findall(EARL) if s.endswith('fail')])) #f = open('/var/log/eiao/time.log','a') #f.write('Getting samples (adaptive sampling):'+str(time.time()-pre) + '\n') #f.close() return getCWAM(sample) else: errorcount = EARL.count('nmg-strawman#fail') passcount = EARL.count('nmg-strawman#pass') #f = open('/var/log/eiao/time.log','a') #f.write('Getting samples (adaptive sampling):'+str(time.time()-pre) + '\n') #f.close() #Save the samples for overall site calculation and return page result global siteerrorcount global sitepasscount siteerrorcount = siteerrorcount + errorcount sitepasscount = sitepasscount + passcount return getBarrierRatioPerPage(passcount, errorcount) def getEARLCWAM(EARL): """Getting CWAM Value of an EARL Sample Keyword arguments: EARL -- Valid EARL as Strign Returns CWAM value of the EARL sample Example: >>> earl = '' >>> print getEARLCWAM(earl) 0.0975 """ model = RDF.Model() pars = RDF.RDFXMLParser() pars.parse_string_into_model(model, EARL, base_uri="http://www.eiao.net/rdf/1.0#") stat = RDF.Statement(subject = None, predicate = None, object = RDF.Uri('http://www.w3.org/WAI/ER/EARL/nmg-strawman#fail')) statements = model.find_statements(stat) sample = [removeTail(str(statement.subject.uri)) for statement in statements] return getCWAM(sample) def getCWAM(sample): """Returns CWAM value for one sample Keyword arguments: sample -- One Sample """ #Old deprecated function, only for old UWEM score algorithm calculation. pre = time.time() temp = 1 for result in sample: #Temporary fix to accept unknown IDs temp *= (1-cwamvalues.get(result,0.02)) #f = open('/var/log/eiao/time.log','a') #f.write('Getting CWAM:'+str(time.time()-pre)+'\n') #f.close() return 1-temp # New functions for testing of score functions def getBarrierRatioPerSite(sitepasscount,siteerrorcount): if siteerrorcount + sitepasscount == 0: return 0 return siteerrorcount / float(siteerrorcount + sitepasscount) def getBarrierRatioPerPage(passcount, errorcount): if errorcount + passcount == 0: return 0 return errorcount / float(errorcount + passcount) def get_D_S(siteresultdict, errorcount): """Barrier diversity of web site""" tmp = 0 for result in siteresultdict: tmp = tmp + ((siteresultdict[result][1] / float(errorcount)) * (siteresultdict[result][1] / float(errorcount))) return 1 - tmp