Change spelling back end to use pyenchant

PyEnchant is a more abstract spelling back end that can use aspell as well as
hunspell.
Use the titlecase module to validate whether titles are properly title-cased.

Note that this change uses two new Python libraries, which should be installed:
- pyenchant
- titlecase
This commit is contained in:
Peter Mosmans
2017-06-24 11:44:10 +10:00
parent efc76b994d
commit 87176f9bbb

View File

@@ -32,23 +32,25 @@ import sys
import textwrap import textwrap
import xml.sax import xml.sax
try: try:
from lxml import etree as ElementTree from lxml import etree as ElementTree
from titlecase import titlecase
import enchant
except ImportError as exception: except ImportError as exception:
print('[-] This script needs lxml', print('[-] This script needs the lxml, pyenchant and titlecase libary ({0}'.format(exception),
file=sys.stderr)
print(" Install requirements using pip install -r requirements.txt",
file=sys.stderr) file=sys.stderr)
print("Install lxml with: sudo pip install lxml", file=sys.stderr)
sys.exit(-1) sys.exit(-1)
# When set to True, the report will be validated using docbuilder # When set to True, the report will be validated using docbuilder
DOCBUILDER = False DOCBUILDER = False
VOCABULARY = 'project-vocabulary.pws' UPPERCASE = ['TCP', 'UDP', 'XSS']
VOCABULARY = 'project-vocabulary.txt'
# Snippets may contain XML fragments without the proper entities # Snippets may contain XML fragments without the proper entities
EXAMPLEDIR = 'examples/' EXAMPLEDIR = 'examples/'
NOT_CAPITALIZED = ['a', 'an', 'and', 'as', 'at', 'but', 'by', 'for', 'in',
'jQuery', 'jQuery-UI', 'nor', 'of', 'on', 'or', 'the', 'to',
'up']
SNIPPETDIR = 'snippets/' SNIPPETDIR = 'snippets/'
STATUS = 25 # loglevel for 'generic' status messages STATUS = 25 # loglevel for 'generic' status messages
TEMPLATEDIR = 'templates/' TEMPLATEDIR = 'templates/'
@@ -58,15 +60,6 @@ WARN_LINE = 80 # There should be a separation character after x characters...
MAX_LINE = 86 # ... and before y MAX_LINE = 86 # ... and before y
if DOCBUILDER:
import docbuilder_proxy
import proxy_vagrant
try:
import aspell
except ImportError:
print('[-] aspell not installed: spelling not available',)
class LogFormatter(logging.Formatter): class LogFormatter(logging.Formatter):
""" """
Format log messages according to their type. Format log messages according to their type.
@@ -130,34 +123,14 @@ the Free Software Foundation, either version 3 of the License, or
return vars(parser.parse_args()) return vars(parser.parse_args())
def initialize_speller(): def validate_spelling(tree, filename, options):
"""
Initialize and return speller module.
"""
speller = None
try:
speller = aspell.Speller(('lang', 'en'),
('personal-dir', '.'),
('personal', VOCABULARY))
except aspell.AspellConfigError as exception: # some versions of aspell use a different path
logging.debug('Encountered exception when trying to intialize spelling: %s',
exception)
try:
speller = aspell.Speller(('lang', 'en'),
('personal-path', './' + VOCABULARY))
except aspell.AspellSpellerError as exception:
logging.error('Could not initialize speller: %s', exception)
if speller:
[logging.debug('%s %s', i[0], i[2]) for i in speller.ConfigKeys()]
return speller
def validate_spelling(tree, filename, options, speller):
""" """
Check spelling of text within tags. Check spelling of text within tags.
If options['learn'], then unknown words will be added to the dictionary. If options['learn'], then unknown words will be added to the dictionary.
""" """
result = True result = True
learn = []
speller = enchant.DictWithPWL("en_US", VOCABULARY)
if not speller: if not speller:
options['spelling'] = False options['spelling'] = False
return result return result
@@ -168,17 +141,20 @@ def validate_spelling(tree, filename, options, speller):
section.tag not in ('a', 'code', 'monospace', 'pre'): section.tag not in ('a', 'code', 'monospace', 'pre'):
for word in re.findall('([a-zA-Z]+\'?[a-zA-Z]+)', section.text): for word in re.findall('([a-zA-Z]+\'?[a-zA-Z]+)', section.text):
if not speller.check(word): if not speller.check(word):
if options['learn']: if word.upper() not in (learned.upper() for learned in learn):
speller.addtoPersonal(word) learn.append(word)
else: result = False
result = False logging.warning('Misspelled (unknown) word %s in %s',
logging.warning('Misspelled (unknown) word %s in %s', word.encode('utf-8'), filename)
word.encode('utf-8'), filename) except:
if options['learn']: print('[-] Hmm. spell exception')
speller.saveAllwords() if options['learn'] and learn:
except aspell.AspellSpellerError as exception: try:
logging.error('Disabled spelling (%s)', exception) with open(VOCABULARY, mode='a+') as open_file:
options['spelling'] = False for word in learn:
open_file.write(word + '\n')
except IOError:
logging.error('Could not write to %s', open_file)
return result return result
@@ -220,7 +196,6 @@ def validate_files(filenames, options):
findings = [] findings = []
non_findings = [] non_findings = []
scans = [] scans = []
speller = initialize_speller()
for filename in filenames: for filename in filenames:
if (filename.lower().endswith('.xml') or if (filename.lower().endswith('.xml') or
filename.lower().endswith('xml"')): filename.lower().endswith('xml"')):
@@ -229,7 +204,7 @@ def validate_files(filenames, options):
(REPORT in filename and not options['no_report']): (REPORT in filename and not options['no_report']):
masters.append(filename) masters.append(filename)
# try: # try:
type_result, xml_type = validate_xml(filename, options, speller) type_result, xml_type = validate_xml(filename, options)
result = result and type_result result = result and type_result
if 'non-finding' in xml_type: if 'non-finding' in xml_type:
non_findings.append(filename) non_findings.append(filename)
@@ -255,7 +230,7 @@ def validate_report():
return proxy_vagrant.execute_command(host, command) return proxy_vagrant.execute_command(host, command)
def validate_xml(filename, options, speller): def validate_xml(filename, options):
""" """
Validates XML file by trying to parse it. Validates XML file by trying to parse it.
Returns True if the file validated successfully. Returns True if the file validated successfully.
@@ -270,7 +245,7 @@ def validate_xml(filename, options, speller):
with open(filename, 'rb') as xml_file: with open(filename, 'rb') as xml_file:
xml.sax.parse(xml_file, xml.sax.ContentHandler()) xml.sax.parse(xml_file, xml.sax.ContentHandler())
tree = ElementTree.parse(filename, ElementTree.XMLParser(strip_cdata=False)) tree = ElementTree.parse(filename, ElementTree.XMLParser(strip_cdata=False))
type_result, xml_type = validate_type(tree, filename, options, speller) type_result, xml_type = validate_type(tree, filename, options)
result = validate_long_lines(tree, filename, options) and result and type_result result = validate_long_lines(tree, filename, options) and result and type_result
if options['edit'] and not result: if options['edit'] and not result:
open_editor(filename) open_editor(filename)
@@ -295,29 +270,24 @@ def get_all_text(node):
return text_string.strip() return text_string.strip()
def abbreviations(word, **kwargs):
    """
    Callback for titlecase: keep known abbreviations in all caps.

    Returns the upper-cased @word when that form is listed in UPPERCASE.
    Returns None for any other word. Extra keyword arguments are accepted
    and ignored.
    """
    upper_word = word.upper()
    return upper_word if upper_word in UPPERCASE else None
def is_capitalized(line): def is_capitalized(line):
""" """
Checks whether all words in @line start with a capital. Checks whether all words in @line start with a capital.
Returns True if that's the case. Returns True if that's the case.
""" """
return not line or line.strip() == capitalize(line) return not line or line.strip() == titlecase(line, callback=abbreviations).strip()
def capitalize(line): def validate_type(tree, filename, options):
"""
Returns a capitalized version of @line, where the first word and all other
words not in NOT_CAPITALIZED are capitalized.
"""
capitalized = ''
for word in line.strip().split():
if word not in NOT_CAPITALIZED or not len(capitalized):
word = word[0].upper() + word[1:]
capitalized += word + ' '
return capitalized.strip()
def validate_type(tree, filename, options, speller):
""" """
Performs specific checks based on type. Performs specific checks based on type.
Currently only finding and non-finding are supported. Currently only finding and non-finding are supported.
@@ -329,7 +299,7 @@ def validate_type(tree, filename, options, speller):
attributes = [] attributes = []
tags = [] tags = []
if options['spelling']: if options['spelling']:
result = validate_spelling(tree, filename, options, speller) result = validate_spelling(tree, filename, options)
if xml_type == 'pentest_report': if xml_type == 'pentest_report':
attributes = ['findingCode'] attributes = ['findingCode']
if xml_type == 'finding': if xml_type == 'finding':
@@ -362,7 +332,7 @@ def validate_type(tree, filename, options, speller):
print('[A] Type missing capitalization (expected {0}, read {1})'. print('[A] Type missing capitalization (expected {0}, read {1})'.
format(capitalize(root.attrib[attribute]), format(capitalize(root.attrib[attribute]),
root.attrib[attribute])) root.attrib[attribute]))
root.attrib[attribute] = capitalize(root.attrib[attribute]) root.attrib[attribute] = titlecase(root.attrib[attribute], callback=abbreviations)
fix = True fix = True
for tag in tags: for tag in tags:
if root.find(tag) is None: if root.find(tag) is None:
@@ -376,9 +346,9 @@ def validate_type(tree, filename, options, speller):
if tag == 'title' and (options['capitalization'] and \ if tag == 'title' and (options['capitalization'] and \
not is_capitalized(root.find(tag).text)): not is_capitalized(root.find(tag).text)):
print('[A] Title missing capitalization in {0} (expected {1}, read {2})'. print('[A] Title missing capitalization in {0} (expected {1}, read {2})'.
format(filename, capitalize(root.find(tag).text), format(filename, titlecase(root.find(tag).text, callback=abbreviations).strip(),
root.find(tag).text)) root.find(tag).text.strip()))
root.find(tag).text = capitalize(root.find(tag).text) root.find(tag).text = titlecase(root.find(tag).text, callback=abbreviations)
fix = True fix = True
all_text = get_all_text(root.find(tag)) all_text = get_all_text(root.find(tag))
if tag == 'description' and all_text.strip()[-1] != '.': if tag == 'description' and all_text.strip()[-1] != '.':