"""List of stopwords and helpers for filtering word collections.

Id$

At import time this module reads three word-list files, ``stopwords``,
``htmlwords`` and ``areas``, resolved relative to the current working
directory, and concatenates their lines into the module-level
``stopwords`` list.  The functions below strip unwanted entries from
word collections (mostly dicts keyed by word).
"""

# Copyright 2008,2009 eGovMon
# This program is distributed under the terms of the GNU General
# Public License.
#
# This file is part of the eGovernment Monitoring
# (eGovMon)
#
# eGovMon is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# eGovMon is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with eGovMon; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
# MA 02110-1301 USA

__author__ = "$Author$"
__version__ = "$Revision$"
__updated__ = "$LastChangedDate$"

import string


def _read_wordlist(path):
    """Return the whitespace-stripped lines of the text file at *path*.

    Blank lines are kept (as empty strings) so the resulting list matches
    what a plain ``readlines()`` + ``strip()`` pass would produce.  The
    file is closed before returning.  A missing or unreadable file raises
    the usual ``IOError``/``OSError`` from ``open``.
    """
    # NOTE(review): the file is opened with the platform default encoding;
    # the encoding of the word-list files is not visible here -- confirm.
    with open(path) as handle:
        return [line.strip() for line in handle]


# Words to drop: general stop words, HTML-related words and area names,
# in that order.  Loaded once, when the module is imported.
stopwords = (_read_wordlist('stopwords')
             + _read_wordlist('htmlwords')
             + _read_wordlist('areas'))

# Punctuation symbols; any word containing one of them is rejected by
# removenonascii().
nonasciiwords = ['-','=','}','{','+','/',':','|','>',',','"','\'','&',')','(']

# Empty in this file; presumably filled in by callers before
# removemunicipalitywords() is used -- confirm against the call sites.
municipalitywords = []

# A word must contain at least one of these to survive removenonascii().
_ASCII_ALNUM = string.ascii_lowercase + '1234567890'

# Characters heavystrip() keeps; everything else is discarded.
_HEAVYSTRIP_KEEP = string.ascii_lowercase + '1234567890øæå'


def removemunicipalitywords(input):
    """Delete every word in ``municipalitywords`` from the dict *input*.

    The dict is modified in place and also returned.  Words that are not
    present as keys are ignored.

    (The parameter name shadows the ``input`` builtin; it is kept because
    it is part of the public signature.)
    """
    for word in municipalitywords:
        input.pop(word, None)
    return input


def heavystrip(input):
    """Return a new dict whose keys are reduced to lowercase letters/digits.

    Each key of *input* is lowercased and every character outside
    ``a-z``, ``0-9`` and ``øæå`` is dropped.  Keys that end up empty are
    omitted.  When several keys reduce to the same string, the value of
    the one iterated last wins.  *input* itself is not modified.
    """
    retwords = {}
    for word, value in input.items():
        retword = ''.join(
            character for character in word.lower()
            if character in _HEAVYSTRIP_KEEP
        )
        if retword:
            retwords[retword] = value
    return retwords


def removestopwords(input):
    """Remove all entries of ``stopwords`` from *input*, in place.

    *input* may be a list, in which case every occurrence of a stop word
    is removed and the order of the remaining items is preserved, or a
    dict, in which case keys that are stop words are deleted.  The same
    object is returned.
    """
    if isinstance(input, list):
        # Build the lookup set per call: ``stopwords`` is a public,
        # mutable list and may have been changed since import.
        blocked = set(stopwords)
        # Slice assignment keeps the caller's list object while filtering
        # in a single pass instead of one scan per stop word.
        input[:] = [word for word in input if word not in blocked]
        return input
    for word in stopwords:
        input.pop(word, None)
    return input


def removenonascii(input):
    """Delete keys of the dict *input* that do not look like plain words.

    A key is removed when it contains any symbol from ``nonasciiwords``
    or contains no ASCII lowercase letter or digit at all (this covers
    empty and whitespace-only keys, and keys made only of uppercase or
    non-ASCII characters).  The dict is modified in place and returned.
    """
    # Iterate over a snapshot of the keys: deleting from a dict while
    # iterating its live key view raises RuntimeError on Python 3.
    for word in list(input.keys()):
        has_symbol = any(symbol in word for symbol in nonasciiwords)
        has_alnum = any(value in word for value in _ASCII_ALNUM)
        if has_symbol or not has_alnum:
            del input[word]
    return input