"""Module for performing adaptive sampling
$Id$
"""
# Copyright 2008-2011 eGovMon
# This program is distributed under the terms of the GNU General
# Public License.
#
# This file is part of the eGovernment Monitoring
# (eGovMon)
#
# eGovMon is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# eGovMon is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with eGovMon; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
# MA 02110-1301 USA
__author__ = "$Author$"
__version__ = "$Revision$"
__updated__ = "$LastChangedDate$"
from math import *
#from pygsl.statistics import *
import RDF
import time
import pdb
if not __name__ == 'AdaptiveSampling.adaptivesampling':
#Hack for making documentation generation possible
from re import compile,DOTALL
import sc
from samplingerror import *
# Module-level state shared by the functions in this module.
# (A "global" statement is only needed inside a function that rebinds one
# of these names; at module level plain assignment is enough.)

# Sample counter; only referenced from commented-out code in getAvgCWAM().
samplenum = 0
# Maps a site to the list of per-page scores collected by dynamic().
sitedictionary = {}
# Not filled by any function shown here; get_D_S() takes a dict of the
# same name as a parameter.
siteresultdict = {}
# Running totals of 'fail' and 'pass' results, updated by getEARLCWAM2().
siteerrorcount = 0
sitepasscount = 0
# Regular expressions compiled once at import time.
# Skipped when the module is loaded under the documentation generator's
# name, because "compile" and "DOTALL" are not imported in that case.
if __name__ != 'AdaptiveSampling.adaptivesampling':
    # Matches a complete rdf:ID="..." attribute (shortest match).
    rdfid = compile('rdf:ID=\".+?\"')
    # Matches from an "earl:result rdf:ID" up to and including the four
    # characters after "strawman#"; DOTALL lets the match span line breaks.
    earlresult = compile(
        r'earl:result rdf:ID.+?strawman\#....',
        DOTALL,
    )
def isFloat(value):
    """Tells whether a value can be converted to a float

    Keyword arguments:
    value -- Any object, typically a string read from a file

    returns True if float(value) succeeds, otherwise False

    Examples:
    >>> isFloat('0.05')
    True
    >>> isFloat('n/a')
    False
    >>> isFloat(None)
    False
    """
    try:
        float(value)
    except (TypeError, ValueError):
        # ValueError: a string that is not a number.
        # TypeError: an object float() cannot handle at all (None, list, ...).
        return False
    return True
def translateDict(value):
    """Expands the short WAM ID prefixes into full URLs for quicker lookups

    Keyword arguments:
    value -- WAM ID in short form ('EIAO:...' or 'Imergo:...')

    returns the WAM ID with every known prefix replaced by its URL;
    text without a known prefix is returned unchanged

    Examples:
    >>> translateDict('EIAO:B10.11.01.001.001-001')
    'http://www.eiao.net/rdf/1.0/#www.eiao.net.1.0.RelaxedWAM.EIAO.B10.11.01.001.001-001'
    >>> translateDict('Imergo:B10.10.01.001.001-001.001')
    'http://www.eiao.net/rdf/1.0/#www.fit.fraunhofer.de.ergebnisse.imergo.Imergo.B10.10.01.001.001-001.001'
    """
    # (short prefix, URL) pairs, applied in this order.
    expansions = (
        ('EIAO:', 'http://www.eiao.net/rdf/1.0/#www.eiao.net.1.0.RelaxedWAM.EIAO.'),
        ('Imergo:', 'http://www.eiao.net/rdf/1.0/#www.fit.fraunhofer.de.ergebnisse.imergo.Imergo.'),
    )
    translated = value
    for prefix, url in expansions:
        translated = translated.replace(prefix, url)
    return translated
def removeTail(value):
    """Removes the tail of a string to retrieve the actual ID

    The tail is everything from the last '.' onwards.

    Keyword arguments:
    value -- WAM ID with tail

    returns WAM ID without tail; a value without any '.' has no tail and
    is returned unchanged

    Examples:
    >>> removeTail('http://www.eiao.net/rdf/1.0/#www.eiao.net.1.0.RelaxedWAM.EIAO.B10.09.01.001.001-001.001-V402382')
    'http://www.eiao.net/rdf/1.0/#www.eiao.net.1.0.RelaxedWAM.EIAO.B10.09.01.001.001-001'
    >>> removeTail('notail')
    'notail'
    """
    head, dot, _tail = value.rpartition('.')
    # Without a dot, slicing at rfind()'s -1 would chop off the last
    # character; keep the value intact instead.
    if not dot:
        return value
    return head
#FIXME: Should this really be commented out? While it is, nothing visible in
#this module assigns cwamvalues, which getCWAM() reads.
#if not __name__ == 'AdaptiveSampling.adaptivesampling':
# global cwamvalues
# sc = sc.SystemConfiguration()
# cwamvalues = dict([(translateDict(a[0]),float(a[1])) for a in [line.split('\t') for line in open(sc.barriercomputationfile).readlines()] if isFloat(a[1])])
def dynamic(reacherrmarg,site,earlsamples,egovmondb,minscenario,writeresult=True):
    """Perform dynamic sampling

    Scores one sample and tells the caller whether enough samples have
    been collected for the site.

    Keyword arguments:
    reacherrmarg -- Error margin to reach. Currently not evaluated, see
                    the note at the end of the function.
    site -- The site this sample is related to
    earlsamples -- Sequence of strings which, joined together, form the
                   complete EARL for this one sample
    egovmondb -- Object whose getsamplecount() returns the number of
                 samples collected so far
    minscenario -- Minimum sample count for the error margin criterion.
                   Currently not evaluated, see the note at the end.
    writeresult -- If the aggregated results should be counted as being
                   part of the site.

    returns (done, average, standard deviation, error margin); done is
    True once getsamplecount() has reached the configured minsamplecount
    """
    global sitedictionary
    if site not in sitedictionary:
        sitedictionary[site] = []
    earlsample = ''.join(earlsamples)
    # NOTE(review): with its default score algorithm getEARLCWAM2() also adds
    # this sample to the module-level pass/fail totals, even when
    # writeresult is False -- confirm that this is intended.
    pagescore = getEARLCWAM2(earlsample)
    if writeresult:
        sitedictionary[site] = sitedictionary[site]+[pagescore]
        avg,stddev,errmargin = getAvgCWAM(site)
    else:
        # Aggregate as if the sample was part of the site, without storing it.
        avg,stddev,errmargin = getAvgCWAM(site,sitedictionary[site]+[pagescore],egovmondb.getsamplecount())
    # Stop criterion (marked FIXME in the original): only the configured
    # minimum sample count decides. The former criterion,
    #   errmargin>reacherrmarg or egovmondb.getsamplecount()<=minscenario
    # was placed after this unconditional return and therefore never ran.
    done = egovmondb.getsamplecount()>=int(sc.SystemConfiguration().minsamplecount)
    return done,avg,stddev,errmargin
def getAvgCWAM(site,cwams=None,samplecount=0):
    """Returns the aggregated score for the site

    The average is the barrier ratio computed from the module-level
    pass/fail totals (sitepasscount and siteerrorcount).

    Keyword arguments:
    site -- The site; only used when reporting missing information
    cwams -- Not used by the current implementation; kept so that
             existing callers keep working
    samplecount -- Not used by the current implementation; kept so that
                   existing callers keep working

    returns average, standard deviation and error margin; the standard
    deviation and the error margin are currently always 0

    raises NoSiteInformationError if no pass or fail result has been
    recorded yet
    """
    #Note: This is called directly!
    #Used for getting the final site result in the end
    # Equivalent to the former "not range(errors)+range(passes)" test,
    # without building two lists (and without relying on range() returning
    # a list, which only holds on Python 2).
    if siteerrorcount <= 0 and sitepasscount <= 0:
        raise NoSiteInformationError(site)
    average = getBarrierRatioPerSite(sitepasscount,siteerrorcount)
    # The standard deviation and the error margin are disabled for now.
    # Earlier versions computed the margin as
    # factor*stddev/sqrt(number of results), with factor 1.96 or 2.326.
    stddev = 0
    errmarg = 0
    return average,stddev,errmarg
def getEARLCWAM2(EARL, scorealgorithm="brps"):
    """Getting the score of an EARL sample without using Redland

    Keyword arguments:
    EARL -- Valid EARL as string
    scorealgorithm -- the algorithm for calculating score: "uwem", "brpp",
                      "brps", "brdpp_s", "brdps_s", "brdpp_p", "brdps_p".
                      Only "uwem" is treated specially; every other value
                      gives the barrier ratio of the page.

    returns the CWAM value of the sample for "uwem", otherwise the barrier
    ratio of the page

    Side effect: for every algorithm except "uwem" the pass and fail
    counts of the sample are added to the module-level totals
    sitepasscount and siteerrorcount.

    Example:
    >>> getEARLCWAM2('')
    0
    """
    #Parsing with simple regular expressions and string counting instead
    #of Redland changed the run time from 106.6s to 1.39s, almost a 100
    #times performance increase. Because of this getEARLCWAM2 is preferable
    #to getEARLCWAM.
    global siteerrorcount
    global sitepasscount
    if scorealgorithm=="uwem":
        # Unique IDs of all failed results: strip rdf:ID=" (8 characters)
        # and the closing quote, then drop the tail of the ID.
        failed = set(
            'http://www.eiao.net/rdf/1.0/#' + removeTail(rdfid.findall(s)[0][8:-1])
            for s in earlresult.findall(EARL)
            if s.endswith('fail')
        )
        return getCWAM(list(failed))
    errorcount = EARL.count('nmg-strawman#fail')
    passcount = EARL.count('nmg-strawman#pass')
    #Save the samples for overall site calculation and return page result
    siteerrorcount = siteerrorcount + errorcount
    sitepasscount = sitepasscount + passcount
    return getBarrierRatioPerPage(passcount, errorcount)
def getEARLCWAM(EARL):
    """Getting CWAM value of an EARL sample by parsing it with the RDF module

    Keyword arguments:
    EARL -- Valid EARL as string

    returns CWAM value of the EARL sample, computed by getCWAM() from the
    IDs of all results whose object is the nmg-strawman "fail" outcome

    getEARLCWAM2 gives the same kind of result without RDF parsing.
    """
    graph = RDF.Model()
    parser = RDF.RDFXMLParser()
    parser.parse_string_into_model(graph, EARL, base_uri="http://www.eiao.net/rdf/1.0#")
    # Any subject and any predicate, as long as the object is "fail".
    failuri = RDF.Uri('http://www.w3.org/WAI/ER/EARL/nmg-strawman#fail')
    pattern = RDF.Statement(subject = None, predicate = None, object = failuri)
    sample = []
    for hit in graph.find_statements(pattern):
        sample.append(removeTail(str(hit.subject.uri)))
    return getCWAM(sample)
def getCWAM(sample):
    """Returns CWAM value for one sample

    Keyword arguments:
    sample -- One sample: the IDs of the failed results

    returns 1 minus the product of (1 - value) over all IDs in the sample,
    where value is looked up in cwamvalues; an empty sample gives 0
    """
    #Old deprecated function, only for old UWEM score algorithm calculation.
    # NOTE(review): cwamvalues is not assigned in the visible part of this
    # module (its initialisation is commented out) -- confirm where it is
    # defined before calling this with a non-empty sample.
    remaining = 1
    for result in sample:
        #Temporary fix to accept unknown IDs
        remaining *= (1-cwamvalues.get(result,0.02))
    return 1-remaining
# New functions for testing of score functions
def getBarrierRatioPerSite(sitepasscount,siteerrorcount):
    """Returns the share of errors among all results of a site

    Keyword arguments:
    sitepasscount -- Number of passed results for the site
    siteerrorcount -- Number of failed results for the site

    returns errors divided by (errors + passes), or 0 if both are zero
    """
    total = siteerrorcount + sitepasscount
    if total != 0:
        return siteerrorcount / float(total)
    # Nothing recorded: avoid dividing by zero.
    return 0
def getBarrierRatioPerPage(passcount, errorcount):
    """Returns the share of errors among all results of a page

    Keyword arguments:
    passcount -- Number of passed results for the page
    errorcount -- Number of failed results for the page

    returns errors divided by (errors + passes), or 0 if both are zero
    """
    results = errorcount + passcount
    # A page without any result gets ratio 0 instead of a division by zero.
    return 0 if results == 0 else errorcount / float(results)
def get_D_S(siteresultdict, errorcount):
    """Barrier diversity of web site

    Keyword arguments:
    siteresultdict -- Dictionary whose values are sequences; the element
                      at index 1 of each value is divided by errorcount
                      (presumably the error count of that entry -- confirm
                      against the caller)
    errorcount -- The number every entry is divided by

    returns 1 minus the sum of the squared shares
    """
    squares = 0
    for entry in siteresultdict.values():
        share = entry[1] / float(errorcount)
        squares = squares + share * share
    return 1 - squares