Change spelling back end to use pyenchant
Pyenchant is a more abstract spelling back end, and can utilize aspell, as well as hunspell. Use titlecase module to validate whether titles are properly title cased. Note that this change uses two new Python libraries, which should be installed: - pyenchant - titlecase
This commit is contained in:
@@ -32,23 +32,25 @@ import sys
|
||||
import textwrap
|
||||
import xml.sax
|
||||
|
||||
|
||||
try:
|
||||
from lxml import etree as ElementTree
|
||||
from titlecase import titlecase
|
||||
import enchant
|
||||
except ImportError as exception:
|
||||
print('[-] This script needs lxml',
|
||||
print('[-] This script needs the lxml, pyenchant and titlecase libary ({0}'.format(exception),
|
||||
file=sys.stderr)
|
||||
print(" Install requirements using pip install -r requirements.txt",
|
||||
file=sys.stderr)
|
||||
print("Install lxml with: sudo pip install lxml", file=sys.stderr)
|
||||
sys.exit(-1)
|
||||
|
||||
|
||||
# When set to True, the report will be validated using docbuilder
|
||||
DOCBUILDER = False
|
||||
VOCABULARY = 'project-vocabulary.pws'
|
||||
UPPERCASE = ['TCP', 'UDP', 'XSS']
|
||||
VOCABULARY = 'project-vocabulary.txt'
|
||||
# Snippets may contain XML fragments without the proper entities
|
||||
EXAMPLEDIR = 'examples/'
|
||||
NOT_CAPITALIZED = ['a', 'an', 'and', 'as', 'at', 'but', 'by', 'for', 'in',
|
||||
'jQuery', 'jQuery-UI', 'nor', 'of', 'on', 'or', 'the', 'to',
|
||||
'up']
|
||||
SNIPPETDIR = 'snippets/'
|
||||
STATUS = 25 # loglevel for 'generic' status messages
|
||||
TEMPLATEDIR = 'templates/'
|
||||
@@ -58,15 +60,6 @@ WARN_LINE = 80 # There should be a separation character after x characters...
|
||||
MAX_LINE = 86 # ... and before y
|
||||
|
||||
|
||||
if DOCBUILDER:
|
||||
import docbuilder_proxy
|
||||
import proxy_vagrant
|
||||
try:
|
||||
import aspell
|
||||
except ImportError:
|
||||
print('[-] aspell not installed: spelling not available',)
|
||||
|
||||
|
||||
class LogFormatter(logging.Formatter):
|
||||
"""
|
||||
Format log messages according to their type.
|
||||
@@ -130,34 +123,14 @@ the Free Software Foundation, either version 3 of the License, or
|
||||
return vars(parser.parse_args())
|
||||
|
||||
|
||||
def initialize_speller():
|
||||
"""
|
||||
Initialize and return speller module.
|
||||
"""
|
||||
speller = None
|
||||
try:
|
||||
speller = aspell.Speller(('lang', 'en'),
|
||||
('personal-dir', '.'),
|
||||
('personal', VOCABULARY))
|
||||
except aspell.AspellConfigError as exception: # some versions of aspell use a different path
|
||||
logging.debug('Encountered exception when trying to intialize spelling: %s',
|
||||
exception)
|
||||
try:
|
||||
speller = aspell.Speller(('lang', 'en'),
|
||||
('personal-path', './' + VOCABULARY))
|
||||
except aspell.AspellSpellerError as exception:
|
||||
logging.error('Could not initialize speller: %s', exception)
|
||||
if speller:
|
||||
[logging.debug('%s %s', i[0], i[2]) for i in speller.ConfigKeys()]
|
||||
return speller
|
||||
|
||||
|
||||
def validate_spelling(tree, filename, options, speller):
|
||||
def validate_spelling(tree, filename, options):
|
||||
"""
|
||||
Check spelling of text within tags.
|
||||
If options['learn'], then unknown words will be added to the dictionary.
|
||||
"""
|
||||
result = True
|
||||
learn = []
|
||||
speller = enchant.DictWithPWL("en_US", VOCABULARY)
|
||||
if not speller:
|
||||
options['spelling'] = False
|
||||
return result
|
||||
@@ -168,17 +141,20 @@ def validate_spelling(tree, filename, options, speller):
|
||||
section.tag not in ('a', 'code', 'monospace', 'pre'):
|
||||
for word in re.findall('([a-zA-Z]+\'?[a-zA-Z]+)', section.text):
|
||||
if not speller.check(word):
|
||||
if options['learn']:
|
||||
speller.addtoPersonal(word)
|
||||
else:
|
||||
result = False
|
||||
logging.warning('Misspelled (unknown) word %s in %s',
|
||||
word.encode('utf-8'), filename)
|
||||
if options['learn']:
|
||||
speller.saveAllwords()
|
||||
except aspell.AspellSpellerError as exception:
|
||||
logging.error('Disabled spelling (%s)', exception)
|
||||
options['spelling'] = False
|
||||
if word.upper() not in (learned.upper() for learned in learn):
|
||||
learn.append(word)
|
||||
result = False
|
||||
logging.warning('Misspelled (unknown) word %s in %s',
|
||||
word.encode('utf-8'), filename)
|
||||
except:
|
||||
print('[-] Hmm. spell exception')
|
||||
if options['learn'] and learn:
|
||||
try:
|
||||
with open(VOCABULARY, mode='a+') as open_file:
|
||||
for word in learn:
|
||||
open_file.write(word + '\n')
|
||||
except IOError:
|
||||
logging.error('Could not write to %s', open_file)
|
||||
return result
|
||||
|
||||
|
||||
@@ -220,7 +196,6 @@ def validate_files(filenames, options):
|
||||
findings = []
|
||||
non_findings = []
|
||||
scans = []
|
||||
speller = initialize_speller()
|
||||
for filename in filenames:
|
||||
if (filename.lower().endswith('.xml') or
|
||||
filename.lower().endswith('xml"')):
|
||||
@@ -229,7 +204,7 @@ def validate_files(filenames, options):
|
||||
(REPORT in filename and not options['no_report']):
|
||||
masters.append(filename)
|
||||
# try:
|
||||
type_result, xml_type = validate_xml(filename, options, speller)
|
||||
type_result, xml_type = validate_xml(filename, options)
|
||||
result = result and type_result
|
||||
if 'non-finding' in xml_type:
|
||||
non_findings.append(filename)
|
||||
@@ -255,7 +230,7 @@ def validate_report():
|
||||
return proxy_vagrant.execute_command(host, command)
|
||||
|
||||
|
||||
def validate_xml(filename, options, speller):
|
||||
def validate_xml(filename, options):
|
||||
"""
|
||||
Validates XML file by trying to parse it.
|
||||
Returns True if the file validated successfully.
|
||||
@@ -270,7 +245,7 @@ def validate_xml(filename, options, speller):
|
||||
with open(filename, 'rb') as xml_file:
|
||||
xml.sax.parse(xml_file, xml.sax.ContentHandler())
|
||||
tree = ElementTree.parse(filename, ElementTree.XMLParser(strip_cdata=False))
|
||||
type_result, xml_type = validate_type(tree, filename, options, speller)
|
||||
type_result, xml_type = validate_type(tree, filename, options)
|
||||
result = validate_long_lines(tree, filename, options) and result and type_result
|
||||
if options['edit'] and not result:
|
||||
open_editor(filename)
|
||||
@@ -295,29 +270,24 @@ def get_all_text(node):
|
||||
return text_string.strip()
|
||||
|
||||
|
||||
def abbreviations(word, **kwargs):
|
||||
"""
|
||||
Check whether word needs to be all caps
|
||||
"""
|
||||
if word.upper() in (UPPERCASE):
|
||||
return word.upper()
|
||||
|
||||
|
||||
def is_capitalized(line):
|
||||
"""
|
||||
Checks whether all words in @line start with a capital.
|
||||
|
||||
Returns True if that's the case.
|
||||
"""
|
||||
return not line or line.strip() == capitalize(line)
|
||||
return not line or line.strip() == titlecase(line, callback=abbreviations).strip()
|
||||
|
||||
|
||||
def capitalize(line):
|
||||
"""
|
||||
Returns a capitalized version of @line, where the first word and all other
|
||||
words not in NOT_CAPITALIZED are capitalized.
|
||||
"""
|
||||
capitalized = ''
|
||||
for word in line.strip().split():
|
||||
if word not in NOT_CAPITALIZED or not len(capitalized):
|
||||
word = word[0].upper() + word[1:]
|
||||
capitalized += word + ' '
|
||||
return capitalized.strip()
|
||||
|
||||
|
||||
def validate_type(tree, filename, options, speller):
|
||||
def validate_type(tree, filename, options):
|
||||
"""
|
||||
Performs specific checks based on type.
|
||||
Currently only finding and non-finding are supported.
|
||||
@@ -329,7 +299,7 @@ def validate_type(tree, filename, options, speller):
|
||||
attributes = []
|
||||
tags = []
|
||||
if options['spelling']:
|
||||
result = validate_spelling(tree, filename, options, speller)
|
||||
result = validate_spelling(tree, filename, options)
|
||||
if xml_type == 'pentest_report':
|
||||
attributes = ['findingCode']
|
||||
if xml_type == 'finding':
|
||||
@@ -362,7 +332,7 @@ def validate_type(tree, filename, options, speller):
|
||||
print('[A] Type missing capitalization (expected {0}, read {1})'.
|
||||
format(capitalize(root.attrib[attribute]),
|
||||
root.attrib[attribute]))
|
||||
root.attrib[attribute] = capitalize(root.attrib[attribute])
|
||||
root.attrib[attribute] = titlecase(root.attrib[attribute], callback=abbreviations)
|
||||
fix = True
|
||||
for tag in tags:
|
||||
if root.find(tag) is None:
|
||||
@@ -376,9 +346,9 @@ def validate_type(tree, filename, options, speller):
|
||||
if tag == 'title' and (options['capitalization'] and \
|
||||
not is_capitalized(root.find(tag).text)):
|
||||
print('[A] Title missing capitalization in {0} (expected {1}, read {2})'.
|
||||
format(filename, capitalize(root.find(tag).text),
|
||||
root.find(tag).text))
|
||||
root.find(tag).text = capitalize(root.find(tag).text)
|
||||
format(filename, titlecase(root.find(tag).text, callback=abbreviations).strip(),
|
||||
root.find(tag).text.strip()))
|
||||
root.find(tag).text = titlecase(root.find(tag).text, callback=abbreviations)
|
||||
fix = True
|
||||
all_text = get_all_text(root.find(tag))
|
||||
if tag == 'description' and all_text.strip()[-1] != '.':
|
||||
|
||||
Reference in New Issue
Block a user