Change spelling back end to use pyenchant

Pyenchant is a more abstract spelling back end, and can utilize aspell, as well as hunspell. Use titlecase module to validate whether titles are properly title cased. Note that this change uses two new Python libraries, which should be installed: - pyenchant - titlecase
2017-06-24 11:44:10 +10:00
parent efc76b994d
commit 87176f9bbb
1 changed files with 43 additions and 73 deletions
--- a/chatops/python/validate_report.py
+++ b/chatops/python/validate_report.py
@@ -32,23 +32,25 @@ import sys
 import textwrap
 import xml.sax
 try:
    from lxml import etree as ElementTree
    from titlecase import titlecase
    import enchant
 except ImportError as exception:
-    print('[-] This script needs lxml',
+    print('[-] This script needs the lxml, pyenchant and titlecase libary ({0}'.format(exception),
          file=sys.stderr)
    print("    Install requirements using pip install -r requirements.txt",
          file=sys.stderr)
    print("Install lxml with: sudo pip install lxml", file=sys.stderr)
    sys.exit(-1)
 # When set to True, the report will be validated using docbuilder
 DOCBUILDER = False
-VOCABULARY = 'project-vocabulary.pws'
+UPPERCASE = ['TCP', 'UDP', 'XSS']
 VOCABULARY = 'project-vocabulary.txt'
 # Snippets may contain XML fragments without the proper entities
 EXAMPLEDIR = 'examples/'
 NOT_CAPITALIZED = ['a', 'an', 'and', 'as', 'at', 'but', 'by', 'for', 'in',
                   'jQuery', 'jQuery-UI', 'nor', 'of', 'on', 'or', 'the', 'to',
                   'up']
 SNIPPETDIR = 'snippets/'
 STATUS = 25 # loglevel for 'generic' status messages
 TEMPLATEDIR = 'templates/'
@@ -58,15 +60,6 @@ WARN_LINE = 80  # There should be a separation character after x characters...
 MAX_LINE = 86  # ... and before y
 if DOCBUILDER:
    import docbuilder_proxy
    import proxy_vagrant
 try:
    import aspell
 except ImportError:
    print('[-] aspell not installed: spelling not available',)
 class LogFormatter(logging.Formatter):
    """
    Format log messages according to their type.
@@ -130,34 +123,14 @@ the Free Software Foundation, either version 3 of the License, or
    return vars(parser.parse_args())
-def initialize_speller():
+def validate_spelling(tree, filename, options):
    """
    Initialize and return speller module.
    """
    speller = None
    try:
        speller = aspell.Speller(('lang', 'en'),
                                 ('personal-dir', '.'),
                                 ('personal', VOCABULARY))
    except aspell.AspellConfigError as exception:  # some versions of aspell use a different path
        logging.debug('Encountered exception when trying to intialize spelling: %s',
                      exception)
        try:
            speller = aspell.Speller(('lang', 'en'),
                                     ('personal-path', './' + VOCABULARY))
        except aspell.AspellSpellerError as exception:
            logging.error('Could not initialize speller: %s', exception)
    if speller:
        [logging.debug('%s %s', i[0], i[2]) for i in speller.ConfigKeys()]
    return speller
 def validate_spelling(tree, filename, options, speller):
    """
    Check spelling of text within tags.
    If options['learn'], then unknown words will be added to the dictionary.
    """
    result = True
    learn = []
    speller = enchant.DictWithPWL("en_US", VOCABULARY)
    if not speller:
        options['spelling'] = False
        return result
@@ -168,17 +141,20 @@ def validate_spelling(tree, filename, options, speller):
               section.tag not in ('a', 'code', 'monospace', 'pre'):
                for word in re.findall('([a-zA-Z]+\'?[a-zA-Z]+)', section.text):
                    if not speller.check(word):
-                        if options['learn']:
+                        if word.upper() not in (learned.upper() for learned in learn):
-                            speller.addtoPersonal(word)
+                            learn.append(word)
-                        else:
+                        result = False
-                            result = False
+                        logging.warning('Misspelled (unknown) word %s in %s',
-                            logging.warning('Misspelled (unknown) word %s in %s',
+                                        word.encode('utf-8'), filename)
-                                            word.encode('utf-8'), filename)
+    except:
-        if options['learn']:
+        print('[-] Hmm. spell exception')
-            speller.saveAllwords()
+    if options['learn'] and learn:
-    except aspell.AspellSpellerError as exception:
+        try:
-        logging.error('Disabled spelling (%s)', exception)
+            with open(VOCABULARY, mode='a+') as open_file:
-        options['spelling'] = False
+                for word in learn:
                    open_file.write(word + '\n')
        except IOError:
            logging.error('Could not write to %s', open_file)
    return result
@@ -220,7 +196,6 @@ def validate_files(filenames, options):
    findings = []
    non_findings = []
    scans = []
    speller = initialize_speller()
    for filename in filenames:
        if (filename.lower().endswith('.xml') or
                filename.lower().endswith('xml"')):
@@ -229,7 +204,7 @@ def validate_files(filenames, options):
                   (REPORT in filename and not options['no_report']):
                    masters.append(filename)
                    # try:
-                type_result, xml_type = validate_xml(filename, options, speller)
+                type_result, xml_type = validate_xml(filename, options)
                result = result and type_result
                if 'non-finding' in xml_type:
                    non_findings.append(filename)
@@ -255,7 +230,7 @@ def validate_report():
    return proxy_vagrant.execute_command(host, command)
-def validate_xml(filename, options, speller):
+def validate_xml(filename, options):
    """
    Validates XML file by trying to parse it.
    Returns True if the file validated successfully.
@@ -270,7 +245,7 @@ def validate_xml(filename, options, speller):
        with open(filename, 'rb') as xml_file:
            xml.sax.parse(xml_file, xml.sax.ContentHandler())
            tree = ElementTree.parse(filename, ElementTree.XMLParser(strip_cdata=False))
-            type_result, xml_type = validate_type(tree, filename, options, speller)
+            type_result, xml_type = validate_type(tree, filename, options)
            result = validate_long_lines(tree, filename, options) and result and type_result
        if options['edit'] and not result:
            open_editor(filename)
@@ -295,29 +270,24 @@ def get_all_text(node):
    return text_string.strip()
 def abbreviations(word, **kwargs):
    """
    Check whether word needs to be all caps
    """
    if word.upper() in (UPPERCASE):
        return word.upper()
 def is_capitalized(line):
    """
    Checks whether all words in @line start with a capital.
    Returns True if that's the case.
    """
-    return not line or line.strip() == capitalize(line)
+    return not line or line.strip() == titlecase(line, callback=abbreviations).strip()
-def capitalize(line):
+def validate_type(tree, filename, options):
    """
    Returns a capitalized version of @line, where the first word and all other
    words not in NOT_CAPITALIZED are capitalized.
    """
    capitalized = ''
    for word in line.strip().split():
        if word not in NOT_CAPITALIZED or not len(capitalized):
            word = word[0].upper() + word[1:]
        capitalized += word + ' '
    return capitalized.strip()
 def validate_type(tree, filename, options, speller):
    """
    Performs specific checks based on type.
    Currently only finding and non-finding are supported.
@@ -329,7 +299,7 @@ def validate_type(tree, filename, options, speller):
    attributes = []
    tags = []
    if options['spelling']:
-        result = validate_spelling(tree, filename, options, speller)
+        result = validate_spelling(tree, filename, options)
    if xml_type == 'pentest_report':
        attributes = ['findingCode']
    if xml_type == 'finding':
@@ -362,7 +332,7 @@ def validate_type(tree, filename, options, speller):
                print('[A] Type missing capitalization (expected {0}, read {1})'.
                      format(capitalize(root.attrib[attribute]),
                             root.attrib[attribute]))
-                root.attrib[attribute] = capitalize(root.attrib[attribute])
+                root.attrib[attribute] = titlecase(root.attrib[attribute], callback=abbreviations)
                fix = True
    for tag in tags:
        if root.find(tag) is None:
@@ -376,9 +346,9 @@ def validate_type(tree, filename, options, speller):
        if tag == 'title' and (options['capitalization'] and \
                               not is_capitalized(root.find(tag).text)):
            print('[A] Title missing capitalization in {0} (expected {1}, read {2})'.
-                  format(filename, capitalize(root.find(tag).text),
+                  format(filename, titlecase(root.find(tag).text, callback=abbreviations).strip(),
-                         root.find(tag).text))
+                         root.find(tag).text.strip()))
-            root.find(tag).text = capitalize(root.find(tag).text)
+            root.find(tag).text = titlecase(root.find(tag).text, callback=abbreviations)
            fix = True
        all_text = get_all_text(root.find(tag))
        if tag == 'description' and all_text.strip()[-1] != '.':