From 541f5b048eff8cef937d9541c9827916408324ec Mon Sep 17 00:00:00 2001
From: Peter Mosmans <support@go-forward.net>
Date: Sat, 24 Jun 2017 11:44:10 +1000
Subject: [PATCH] Change spelling back end to use pyenchant

Pyenchant is a more abstract spelling back end, and can utilize aspell, as well as
hunspell.
Use titlecase module to validate whether titles are properly title cased.

Note that this change uses two new Python libraries, which should be installed:
- pyenchant
- titlecase
---
 chatops/python/validate_report.py | 116 +++++++++++-------------------
 1 file changed, 43 insertions(+), 73 deletions(-)

diff --git a/chatops/python/validate_report.py b/chatops/python/validate_report.py
index affba9c..5de18e6 100644
--- a/chatops/python/validate_report.py
+++ b/chatops/python/validate_report.py
@@ -32,23 +32,25 @@ import sys
 import textwrap
 import xml.sax
 
+
 try:
     from lxml import etree as ElementTree
+    from titlecase import titlecase
+    import enchant
 except ImportError as exception:
-    print('[-] This script needs lxml',
+    print('[-] This script needs the lxml, pyenchant and titlecase libary ({0}'.format(exception),
+          file=sys.stderr)
+    print("    Install requirements using pip install -r requirements.txt",
           file=sys.stderr)
-    print("Install lxml with: sudo pip install lxml", file=sys.stderr)
     sys.exit(-1)
 
 
 # When set to True, the report will be validated using docbuilder
 DOCBUILDER = False
-VOCABULARY = 'project-vocabulary.pws'
+UPPERCASE = ['TCP', 'UDP', 'XSS']
+VOCABULARY = 'project-vocabulary.txt'
 # Snippets may contain XML fragments without the proper entities
 EXAMPLEDIR = 'examples/'
-NOT_CAPITALIZED = ['a', 'an', 'and', 'as', 'at', 'but', 'by', 'for', 'in',
-                   'jQuery', 'jQuery-UI', 'nor', 'of', 'on', 'or', 'the', 'to',
-                   'up']
 SNIPPETDIR = 'snippets/'
 STATUS = 25 # loglevel for 'generic' status messages
 TEMPLATEDIR = 'templates/'
@@ -58,15 +60,6 @@ WARN_LINE = 80  # There should be a separation character after x characters...
 MAX_LINE = 86  # ... and before y
 
 
-if DOCBUILDER:
-    import docbuilder_proxy
-    import proxy_vagrant
-try:
-    import aspell
-except ImportError:
-    print('[-] aspell not installed: spelling not available',)
-
-
 class LogFormatter(logging.Formatter):
     """
     Format log messages according to their type.
@@ -130,34 +123,14 @@ the Free Software Foundation, either version 3 of the License, or
     return vars(parser.parse_args())
 
 
-def initialize_speller():
-    """
-    Initialize and return speller module.
-    """
-    speller = None
-    try:
-        speller = aspell.Speller(('lang', 'en'),
-                                 ('personal-dir', '.'),
-                                 ('personal', VOCABULARY))
-    except aspell.AspellConfigError as exception:  # some versions of aspell use a different path
-        logging.debug('Encountered exception when trying to intialize spelling: %s',
-                      exception)
-        try:
-            speller = aspell.Speller(('lang', 'en'),
-                                     ('personal-path', './' + VOCABULARY))
-        except aspell.AspellSpellerError as exception:
-            logging.error('Could not initialize speller: %s', exception)
-    if speller:
-        [logging.debug('%s %s', i[0], i[2]) for i in speller.ConfigKeys()]
-    return speller
-
-
-def validate_spelling(tree, filename, options, speller):
+def validate_spelling(tree, filename, options):
     """
     Check spelling of text within tags.
     If options['learn'], then unknown words will be added to the dictionary.
     """
     result = True
+    learn = []
+    speller = enchant.DictWithPWL("en_US", VOCABULARY)
     if not speller:
         options['spelling'] = False
         return result
@@ -168,17 +141,20 @@ def validate_spelling(tree, filename, options, speller):
                section.tag not in ('a', 'code', 'monospace', 'pre'):
                 for word in re.findall('([a-zA-Z]+\'?[a-zA-Z]+)', section.text):
                     if not speller.check(word):
-                        if options['learn']:
-                            speller.addtoPersonal(word)
-                        else:
-                            result = False
-                            logging.warning('Misspelled (unknown) word %s in %s',
-                                            word.encode('utf-8'), filename)
-        if options['learn']:
-            speller.saveAllwords()
-    except aspell.AspellSpellerError as exception:
-        logging.error('Disabled spelling (%s)', exception)
-        options['spelling'] = False
+                        if word.upper() not in (learned.upper() for learned in learn):
+                            learn.append(word)
+                        result = False
+                        logging.warning('Misspelled (unknown) word %s in %s',
+                                        word.encode('utf-8'), filename)
+    except:
+        print('[-] Hmm. spell exception')
+    if options['learn'] and learn:
+        try:
+            with open(VOCABULARY, mode='a+') as open_file:
+                for word in learn:
+                    open_file.write(word + '\n')
+        except IOError:
+            logging.error('Could not write to %s', open_file)
     return result
 
 
@@ -220,7 +196,6 @@ def validate_files(filenames, options):
     findings = []
     non_findings = []
     scans = []
-    speller = initialize_speller()
     for filename in filenames:
         if (filename.lower().endswith('.xml') or
                 filename.lower().endswith('xml"')):
@@ -229,7 +204,7 @@ def validate_files(filenames, options):
                    (REPORT in filename and not options['no_report']):
                     masters.append(filename)
                     # try:
-                type_result, xml_type = validate_xml(filename, options, speller)
+                type_result, xml_type = validate_xml(filename, options)
                 result = result and type_result
                 if 'non-finding' in xml_type:
                     non_findings.append(filename)
@@ -255,7 +230,7 @@ def validate_report():
     return proxy_vagrant.execute_command(host, command)
 
 
-def validate_xml(filename, options, speller):
+def validate_xml(filename, options):
     """
     Validates XML file by trying to parse it.
     Returns True if the file validated successfully.
@@ -270,7 +245,7 @@ def validate_xml(filename, options, speller):
         with open(filename, 'rb') as xml_file:
             xml.sax.parse(xml_file, xml.sax.ContentHandler())
             tree = ElementTree.parse(filename, ElementTree.XMLParser(strip_cdata=False))
-            type_result, xml_type = validate_type(tree, filename, options, speller)
+            type_result, xml_type = validate_type(tree, filename, options)
             result = validate_long_lines(tree, filename, options) and result and type_result
         if options['edit'] and not result:
             open_editor(filename)
@@ -295,29 +270,24 @@ def get_all_text(node):
     return text_string.strip()
 
 
+def abbreviations(word, **kwargs):
+    """
+    Check whether word needs to be all caps
+    """
+    if word.upper() in (UPPERCASE):
+        return word.upper()
+
+
 def is_capitalized(line):
     """
     Checks whether all words in @line start with a capital.
 
     Returns True if that's the case.
     """
-    return not line or line.strip() == capitalize(line)
+    return not line or line.strip() == titlecase(line, callback=abbreviations).strip()
 
 
-def capitalize(line):
-    """
-    Returns a capitalized version of @line, where the first word and all other
-    words not in NOT_CAPITALIZED are capitalized.
-    """
-    capitalized = ''
-    for word in line.strip().split():
-        if word not in NOT_CAPITALIZED or not len(capitalized):
-            word = word[0].upper() + word[1:]
-        capitalized += word + ' '
-    return capitalized.strip()
-
-
-def validate_type(tree, filename, options, speller):
+def validate_type(tree, filename, options):
     """
     Performs specific checks based on type.
     Currently only finding and non-finding are supported.
@@ -329,7 +299,7 @@ def validate_type(tree, filename, options, speller):
     attributes = []
     tags = []
     if options['spelling']:
-        result = validate_spelling(tree, filename, options, speller)
+        result = validate_spelling(tree, filename, options)
     if xml_type == 'pentest_report':
         attributes = ['findingCode']
     if xml_type == 'finding':
@@ -362,7 +332,7 @@ def validate_type(tree, filename, options, speller):
                 print('[A] Type missing capitalization (expected {0}, read {1})'.
                       format(capitalize(root.attrib[attribute]),
                              root.attrib[attribute]))
-                root.attrib[attribute] = capitalize(root.attrib[attribute])
+                root.attrib[attribute] = titlecase(root.attrib[attribute], callback=abbreviations)
                 fix = True
     for tag in tags:
         if root.find(tag) is None:
@@ -376,9 +346,9 @@ def validate_type(tree, filename, options, speller):
         if tag == 'title' and (options['capitalization'] and \
                                not is_capitalized(root.find(tag).text)):
             print('[A] Title missing capitalization in {0} (expected {1}, read {2})'.
-                  format(filename, capitalize(root.find(tag).text),
-                         root.find(tag).text))
-            root.find(tag).text = capitalize(root.find(tag).text)
+                  format(filename, titlecase(root.find(tag).text, callback=abbreviations).strip(),
+                         root.find(tag).text.strip()))
+            root.find(tag).text = titlecase(root.find(tag).text, callback=abbreviations)
             fix = True
         all_text = get_all_text(root.find(tag))
         if tag == 'description' and all_text.strip()[-1] != '.':