pentext/chatops/python/validate_report.py
Peter Mosmans 541f5b048e Change spelling back end to use pyenchant
Pyenchant is a more abstract spelling back end, and can utilize aspell, as well as
hunspell.
Use titlecase module to validate whether titles are properly title cased.

Note that this change uses two new Python libraries, which should be installed:
- pyenchant
- titlecase
2017-06-24 11:44:10 +10:00

568 lines
20 KiB
Python

#!/usr/bin/env python
"""
Cross-checks findings, validates XML files, offerte and report files.
This script is part of the PenText framework
https://pentext.org
Copyright (C) 2015-2017 Radically Open Security
https://www.radicallyopensecurity.com
Author: Peter Mosmans
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
"""
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals
import argparse
import logging
import mmap
import os
import re
import subprocess
import sys
import textwrap
import xml.sax
try:
from lxml import etree as ElementTree
from titlecase import titlecase
import enchant
except ImportError as exception:
print('[-] This script needs the lxml, pyenchant and titlecase libary ({0}'.format(exception),
file=sys.stderr)
print(" Install requirements using pip install -r requirements.txt",
file=sys.stderr)
sys.exit(-1)
# When set to True, the report will be validated using docbuilder
DOCBUILDER = False
UPPERCASE = ['TCP', 'UDP', 'XSS']
VOCABULARY = 'project-vocabulary.txt'
# Snippets may contain XML fragments without the proper entities
EXAMPLEDIR = 'examples/'
SNIPPETDIR = 'snippets/'
STATUS = 25 # loglevel for 'generic' status messages
TEMPLATEDIR = 'templates/'
OFFERTE = '/offerte.xml'
REPORT = '/report.xml'
WARN_LINE = 80 # There should be a separation character after x characters...
MAX_LINE = 86 # ... and before y
class LogFormatter(logging.Formatter):
"""
Format log messages according to their type.
"""
# DEBUG = (10) debug status messages
# INFO = (20) verbose status messages
# STATUS = (25) generic status messages
# WARNING = (30) warning messages (= errors in validation)
# ERROR = (40) error messages (= program errors)
FORMATS = {logging.DEBUG :"DEBUG: %(module)s: %(lineno)d: %(message)s",
logging.INFO : "[*] %(message)s",
STATUS : "[+] %(message)s",
logging.WARN : "[-] %(message)s",
logging.ERROR : "ERROR: %(message)s",
'DEFAULT' : "%(message)s"}
def format(self, record):
self._fmt = self.FORMATS.get(record.levelno, self.FORMATS['DEFAULT'])
return logging.Formatter.format(self, record)
def parse_arguments():
"""
Parses command line arguments.
"""
parser = argparse.ArgumentParser(
formatter_class=argparse.RawDescriptionHelpFormatter,
description=textwrap.dedent('''\
validate_report - validates offer letters and reports
Copyright (C) 2015-2017 Radically Open Security (Peter Mosmans)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.'''))
parser.add_argument('-a', '--all', action='store_true',
help='Perform all checks')
parser.add_argument('--auto-fix', action='store_true',
help='Try to automatically correct issues')
parser.add_argument('-c', '--capitalization', action='store_true',
help='Check capitalization')
parser.add_argument('--debug', action='store_true',
help='Show debug information')
parser.add_argument('--edit', action='store_true',
help='Open files with issues using an editor')
parser.add_argument('--learn', action='store_true',
help='Store all unknown words in dictionary file')
parser.add_argument('--long', action='store_true',
help='Check for long lines')
parser.add_argument('--offer', action='store_true',
help='Validate offer master file')
parser.add_argument('--spelling', action='store_true',
help='Check spelling')
parser.add_argument('-v', '--verbose', action='store_true',
help='increase output verbosity')
parser.add_argument('--no-report', action='store_true',
help='Do not validate report master file')
parser.add_argument('--quiet', action='store_true',
help='Don\'t output status messages')
return vars(parser.parse_args())
def validate_spelling(tree, filename, options):
"""
Check spelling of text within tags.
If options['learn'], then unknown words will be added to the dictionary.
"""
result = True
learn = []
speller = enchant.DictWithPWL("en_US", VOCABULARY)
if not speller:
options['spelling'] = False
return result
try:
root = tree.getroot()
for section in root.iter():
if section.text and isinstance(section.tag, basestring) and \
section.tag not in ('a', 'code', 'monospace', 'pre'):
for word in re.findall('([a-zA-Z]+\'?[a-zA-Z]+)', section.text):
if not speller.check(word):
if word.upper() not in (learned.upper() for learned in learn):
learn.append(word)
result = False
logging.warning('Misspelled (unknown) word %s in %s',
word.encode('utf-8'), filename)
except:
print('[-] Hmm. spell exception')
if options['learn'] and learn:
try:
with open(VOCABULARY, mode='a+') as open_file:
for word in learn:
open_file.write(word + '\n')
except IOError:
logging.error('Could not write to %s', open_file)
return result
def all_files():
"""
Returns a list of all files contained in the git repository.
"""
cmd = ['git', 'ls-files']
process = subprocess.Popen(cmd, stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
return process.stdout.read().splitlines()
def open_editor(filename):
"""
Open editor with file to edit.
"""
if sys.platform in ('linux', 'linux2'):
editor = os.getenv('EDITOR')
if editor:
print('{0} {1}'.format(editor, filename))
sys.stdout.flush()
subprocess.call([editor, '"{0}"'.format(filename)], shell=True)
else:
subprocess.call('xdg-open', filename)
elif sys.platform == "darwin":
subprocess.call(['open', filename])
elif sys.platform == "win32":
os.system('"{0}"'.format(filename.replace('/', os.path.sep)))
def validate_files(filenames, options):
"""
Checks file extensions and calls appropriate validator function.
Returns True if all files validated succesfully.
"""
result = True
masters = []
findings = []
non_findings = []
scans = []
for filename in filenames:
if (filename.lower().endswith('.xml') or
filename.lower().endswith('xml"')):
if SNIPPETDIR not in filename and TEMPLATEDIR not in filename:
if (OFFERTE in filename and options['offer']) or \
(REPORT in filename and not options['no_report']):
masters.append(filename)
# try:
type_result, xml_type = validate_xml(filename, options)
result = result and type_result
if 'non-finding' in xml_type:
non_findings.append(filename)
else:
if 'finding' in xml_type:
findings.append(filename)
else:
if 'scans' in xml_type:
scans.append(filename)
if len(masters):
for master in masters:
result = validate_master(master, findings, non_findings, scans, options) and result
return result
def validate_report():
"""
Validates XML report file by trying to build it.
Returns True if the report was built successful.
"""
host, command = docbuilder_proxy.read_config(docbuilder_proxy.CONFIG_FILE)
command = command + ' -c'
return proxy_vagrant.execute_command(host, command)
def validate_xml(filename, options):
"""
Validates XML file by trying to parse it.
Returns True if the file validated successfully.
"""
result = True
xml_type = ''
# crude check whether the file is outside the pentext framework
if 'notes' in filename:
return result, xml_type
logging.info('Validating XML file: %s', filename)
try:
with open(filename, 'rb') as xml_file:
xml.sax.parse(xml_file, xml.sax.ContentHandler())
tree = ElementTree.parse(filename, ElementTree.XMLParser(strip_cdata=False))
type_result, xml_type = validate_type(tree, filename, options)
result = validate_long_lines(tree, filename, options) and result and type_result
if options['edit'] and not result:
open_editor(filename)
except (xml.sax.SAXException, ElementTree.ParseError) as exception:
print('[-] validating {0} failed ({1})'.format(filename, exception))
result = False
except IOError as exception:
print('[-] validating {0} failed ({1})'.format(filename, exception))
result = False
return result, xml_type
def get_all_text(node):
"""
Retrieves all text within tags.
"""
text_string = node.text or ''
for element in node:
text_string += get_all_text(element)
if node.tail:
text_string += node.tail
return text_string.strip()
def abbreviations(word, **kwargs):
"""
Check whether word needs to be all caps
"""
if word.upper() in (UPPERCASE):
return word.upper()
def is_capitalized(line):
"""
Checks whether all words in @line start with a capital.
Returns True if that's the case.
"""
return not line or line.strip() == titlecase(line, callback=abbreviations).strip()
def validate_type(tree, filename, options):
"""
Performs specific checks based on type.
Currently only finding and non-finding are supported.
"""
result = True
fix = False
root = tree.getroot()
xml_type = root.tag
attributes = []
tags = []
if options['spelling']:
result = validate_spelling(tree, filename, options)
if xml_type == 'pentest_report':
attributes = ['findingCode']
if xml_type == 'finding':
attributes = ['threatLevel', 'type', 'id']
tags = ['title', 'description', 'technicaldescription', 'impact',
'recommendation']
if xml_type == 'non-finding':
attributes = ['id']
tags = ['title']
if not len(attributes):
return result, xml_type
for attribute in attributes:
if attribute not in root.attrib:
print('[A] Missing obligatory attribute in {0}: {1}'.
format(filename, attribute))
if attribute == 'id':
root.set(attribute, filename)
fix = True
else:
result = False
else:
if attribute == 'threatLevel' and root.attrib[attribute] not in \
('Low', 'Moderate', 'Elevated', 'High', 'Extreme'):
print('[-] threatLevel is not Low, Moderate, High, Elevated or Extreme: {0} {1}'.
format(filename, root.attrib[attribute]))
result = False
if attribute == 'type' and (options['capitalization'] and not \
is_capitalized(root.attrib[attribute])):
print('[A] Type missing capitalization (expected {0}, read {1})'.
format(capitalize(root.attrib[attribute]),
root.attrib[attribute]))
root.attrib[attribute] = titlecase(root.attrib[attribute], callback=abbreviations)
fix = True
for tag in tags:
if root.find(tag) is None:
logging.warning('Missing tag in %s: %s', filename, tag)
result = False
continue
if not get_all_text(root.find(tag)):
logging.warning('Empty tag in %s: %s', filename, tag)
result = False
continue
if tag == 'title' and (options['capitalization'] and \
not is_capitalized(root.find(tag).text)):
print('[A] Title missing capitalization in {0} (expected {1}, read {2})'.
format(filename, titlecase(root.find(tag).text, callback=abbreviations).strip(),
root.find(tag).text.strip()))
root.find(tag).text = titlecase(root.find(tag).text, callback=abbreviations)
fix = True
all_text = get_all_text(root.find(tag))
if tag == 'description' and all_text.strip()[-1] != '.':
print('[A] Description missing final dot in {0}: {1}'.format(filename, all_text))
root.find(tag).text = all_text.strip() + '.'
fix = True
if fix:
if options['auto_fix']:
print('[+] Automatically fixed {0}'.format(filename))
tree.write(filename)
else:
print('[+] NOTE: Items with [A] can be fixed automatically, use --auto-fix')
return (result and not fix), xml_type
def validate_long_lines(tree, filename, options):
"""
Checks whether pre or code section contains lines longer than MAX_LINE characters
Returns True if the file validated successfully.
"""
if not options['long']:
return True
result = True
fix = False
root = tree.getroot()
for pre_section in [j for section in ('pre', 'code') for j in root.iter(section)]:
if pre_section.text:
fixed_text = ''
for line in pre_section.text.splitlines():
while len(line) > MAX_LINE:
result = False
print('[-] {0} Line inside {1} too long: {2}'.
format(filename, section, line.encode('utf-8')[MAX_LINE:]))
cutpoint = MAX_LINE
for split in [' ', '"', '\'', '=', '-', ';']:
if split in line.encode('utf-8')[WARN_LINE:MAX_LINE]:
cutpoint = line.find(split, WARN_LINE, MAX_LINE)
fix = True
fixed_line = line[:cutpoint] + '\n'
print('cutted line {0}'.format(line))
line = line[cutpoint:]
fixed_text += fixed_line.encode('utf-8')
print('[A] can be fixed (breaking at {0}): {1}'.format(cutpoint, fixed_line))
fixed_text += line + '\n'
if fix and options['auto_fix']:
print('[+] Automatically fixed {0}'.format(filename))
pre_section.text = fixed_text
print(fixed_text)
tree.write(filename)
close_file(filename)
return result
def validate_master(filename, findings, non_findings, scans, options):
"""
Validates master file.
"""
result = True
include_findings = []
include_nonfindings = []
logging.info('Validating master file %s', filename)
try:
xmltree = ElementTree.parse(filename,
ElementTree.XMLParser(strip_cdata=False))
if not find_keyword(xmltree, 'TODO', filename):
print('[-] Keyword checks failed for {0}'.format(filename))
result = False
logging.info('Performing cross check on findings, non-findings and scans...')
for finding in findings:
if not cross_check_file(filename, finding):
print('[A] Cross check failed for finding {0}'.
format(finding))
include_findings.append(finding)
result = False
for non_finding in non_findings:
if not cross_check_file(filename, non_finding):
logging.warning('Cross check failed for non-finding %s', non_finding)
include_nonfindings.append(non_finding)
result = False
if result:
logging.info('Cross checks successful')
except (ElementTree.ParseError, IOError) as exception:
logging.warning('Validating %s failed: %s', filename, exception)
result = False
if not result:
if options['auto_fix']:
add_include(filename, 'findings', include_findings)
add_include(filename, 'nonFindings', include_nonfindings)
close_file(filename)
logging.info('Automatically fixed %s', filename)
else:
logging.warning('Item can be fixed automatically, use --auto-fix')
return result
def report_string(report_file):
"""
Return the report_file into a big memory mapped string.
"""
try:
report = open(report_file)
return mmap.mmap(report.fileno(), 0, access=mmap.ACCESS_READ)
except IOError as exception:
logging.critical('Could not open %s: %s', report_file, exception)
sys.exit(-1)
def cross_check_file(filename, external):
"""
Checks whether filename contains a cross-check to the file external.
"""
result = True
report_text = report_string(filename)
if report_text.find(external) == -1:
logging.warning('Could not find a reference in %s to %s', filename, external)
result = False
return result
def add_include(filename, identifier, findings):
"""
Adds XML include based on the identifier ('findings' or 'nonFindings').
"""
tree = ElementTree.parse(filename, ElementTree.XMLParser(strip_cdata=False))
root = tree.getroot()
for section in tree.iter('section'):
if section.attrib['id'] == identifier:
finding_section = section
if finding_section is not None:
for finding in findings:
new_finding = ElementTree.XML('<placeholderinclude href="../{0}"/>'.format(finding))
finding_section.append(new_finding)
tree.write(filename, encoding="utf-8", xml_declaration=True, pretty_print=True)
def close_file(filename):
"""
Replace placeholder with proper XML include.
"""
f = open(filename, 'r')
filedata = f.read()
f.close()
newdata = filedata.replace("placeholderinclude", "xi:include")
fileout = filename
f = open(fileout, 'w')
f.write(newdata)
f.close()
tree = ElementTree.parse(filename, ElementTree.XMLParser(strip_cdata=False))
tree.write(filename, encoding="utf-8", xml_declaration=True, pretty_print=True)
def find_keyword(xmltree, keyword, filename):
"""
Finds keywords in an XML tree.
This function needs lots of TLC.
"""
result = True
section = ''
for tag in xmltree.iter():
if tag.tag == 'section' and' id' in tag.attrib:
section = 'in {0}'.format(tag.attrib['id'])
if tag.text:
if keyword in tag.text:
logging.warning('%s found in %s %s', keyword, filename, section)
result = False
return result
def setup_logging(options):
"""
Set up loghandlers according to options.
"""
logger = logging.getLogger()
logger.setLevel(0)
console = logging.StreamHandler(stream=sys.stdout)
console.setFormatter(LogFormatter())
if options['debug']:
console.setLevel(logging.DEBUG)
else:
if options['verbose']:
console.setLevel(logging.INFO)
else:
logger.setLevel(STATUS)
logger.addHandler(console)
def main():
"""
The main program. Cross-checks, validates XML files and report.
Returns True if the checks were successful.
"""
# we want to print pretty Unicode characters
reload(sys)
sys.setdefaultencoding('utf-8')
options = parse_arguments()
setup_logging(options)
if options['all']:
options['capitalization'] = True
options['long'] = True
if options['learn']:
logging.debug('Adding unknown words to %s', VOCABULARY)
logging.info('Validating all XML files...')
result = validate_files(all_files(), options)
if result:
if DOCBUILDER:
logging.info('Validating report build...')
result = validate_report() and result
if result:
logging.log(STATUS, 'Validation checks successful. Good to go')
else:
logging.warning('Validation failed')
if options['spelling'] and options['learn']:
logging.log(STATUS, 'Don\'t forget to check the vocabulary file %s', VOCABULARY)
if __name__ == "__main__":
main()