#!/usr/bin/env python

"""
Cross-checks findings, validates XML files, offerte and report files.

This script is part of the PenText framework
                           https://pentext.org

   Copyright (C) 2015-2017 Radically Open Security
                           https://www.radicallyopensecurity.com

                Author: Peter Mosmans

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
"""

from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals

import argparse
import io
import logging
import mmap
import os
import re
import shlex
import subprocess
import sys
import textwrap
import xml.sax

try:
    from lxml import etree as ElementTree
except ImportError as exception:
    print('[-] This script needs lxml', file=sys.stderr)
    print("Install lxml with: sudo pip install lxml", file=sys.stderr)
    sys.exit(-1)


# When set to True, the report will be validated using docbuilder
DOCBUILDER = False
VOCABULARY = 'project-vocabulary.pws'
# Snippets may contain XML fragments without the proper entities
EXAMPLEDIR = 'examples/'
NOT_CAPITALIZED = ['a', 'an', 'and', 'as', 'at', 'but', 'by', 'for', 'in',
                   'jQuery', 'jQuery-UI', 'nor', 'of', 'on', 'or', 'the', 'to',
                   'up']
SNIPPETDIR = 'snippets/'
STATUS = 25  # loglevel for 'generic' status messages
TEMPLATEDIR = 'templates/'
OFFERTE = '/offerte.xml'
REPORT = '/report.xml'
WARN_LINE = 80  # There should be a separation character after x characters...
MAX_LINE = 86  # ... and before y

# Text type usable with isinstance() on both Python 2 and Python 3.
try:
    STRING_TYPES = (basestring,)
except NameError:
    STRING_TYPES = (str,)


if DOCBUILDER:
    import docbuilder_proxy
    import proxy_vagrant

try:
    import aspell
except ImportError:
    # Bind the name so that later code can test for availability instead of
    # failing with a NameError.
    aspell = None
    print('[-] aspell not installed: spelling not available',)


class LogFormatter(logging.Formatter):
    """
    Format log messages according to their type.
    """
    # DEBUG   = (10) debug status messages
    # INFO    = (20) verbose status messages
    # STATUS  = (25) generic status messages
    # WARNING = (30) warning messages (= errors in validation)
    # ERROR   = (40) error messages (= program errors)
    FORMATS = {logging.DEBUG: "DEBUG: %(module)s: %(lineno)d: %(message)s",
               logging.INFO: "[*] %(message)s",
               STATUS: "[+] %(message)s",
               logging.WARN: "[-] %(message)s",
               logging.ERROR: "ERROR: %(message)s",
               'DEFAULT': "%(message)s"}

    def format(self, record):
        """
        Format @record using the layout registered for its level.

        Levels without an entry in FORMATS use the 'DEFAULT' layout.
        """
        fmt = self.FORMATS.get(record.levelno, self.FORMATS['DEFAULT'])
        self._fmt = fmt
        # Python 3 formats through a style object; without updating it the
        # per-level layout would silently be ignored.
        style = getattr(self, '_style', None)
        if style is not None:
            style._fmt = fmt
        return logging.Formatter.format(self, record)


def parse_arguments():
    """
    Parses command line arguments.

    Returns a dictionary mapping option names to their values.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=textwrap.dedent('''\
validate_report - validates offer letters and reports

Copyright (C) 2015-2017 Radically Open Security (Peter Mosmans)

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.'''))
    parser.add_argument('-a', '--all', action='store_true',
                        help='Perform all checks')
    parser.add_argument('--auto-fix', action='store_true',
                        help='Try to automatically correct issues')
    parser.add_argument('-c', '--capitalization', action='store_true',
                        help='Check capitalization')
    parser.add_argument('--debug', action='store_true',
                        help='Show debug information')
    parser.add_argument('--edit', action='store_true',
                        help='Open files with issues using an editor')
    parser.add_argument('--learn', action='store_true',
                        help='Store all unknown words in dictionary file')
    parser.add_argument('--long', action='store_true',
                        help='Check for long lines')
    parser.add_argument('--offer', action='store_true',
                        help='Validate offer master file')
    parser.add_argument('--spelling', action='store_true',
                        help='Check spelling')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='increase output verbosity')
    parser.add_argument('--no-report', action='store_true',
                        help='Do not validate report master file')
    parser.add_argument('--quiet', action='store_true',
                        help='Don\'t output status messages')
    return vars(parser.parse_args())


def initialize_speller():
    """
    Initialize and return speller module.

    Returns None when aspell is not installed or could not be configured.
    """
    if aspell is None:
        return None
    speller = None
    try:
        speller = aspell.Speller(('lang', 'en'),
                                 ('personal-dir', '.'),
                                 ('personal', VOCABULARY))
    except aspell.AspellConfigError as exception:
        # some versions of aspell use a different path
        logging.debug('Encountered exception when trying to intialize spelling: %s',
                      exception)
        try:
            speller = aspell.Speller(('lang', 'en'),
                                     ('personal-path', './' + VOCABULARY))
        except (aspell.AspellConfigError, aspell.AspellSpellerError) as exception:
            logging.error('Could not initialize speller: %s', exception)
    if speller:
        for config_key in speller.ConfigKeys():
            logging.debug('%s %s', config_key[0], config_key[2])
    return speller


def validate_spelling(tree, filename, options, speller):
    """
    Check spelling of text within tags.
    If options['learn'], then unknown words will be added to the dictionary.

    Returns True if no unknown words were reported. When no speller is
    available, or the speller raises an error, options['spelling'] is set to
    False so that spelling is skipped from then on.
    """
    result = True
    if not speller:
        options['spelling'] = False
        return result
    try:
        root = tree.getroot()
        for section in root.iter():
            # Only elements with a string tag are checked; text inside
            # literal/code-like tags is skipped.
            if not section.text or not isinstance(section.tag, STRING_TYPES):
                continue
            if section.tag in ('a', 'code', 'monospace', 'pre'):
                continue
            for word in re.findall(r"([a-zA-Z]+'?[a-zA-Z]+)", section.text):
                if speller.check(word):
                    continue
                if options['learn']:
                    speller.addtoPersonal(word)
                else:
                    result = False
                    logging.warning('Misspelled (unknown) word %s in %s',
                                    word, filename)
        if options['learn']:
            speller.saveAllwords()
    except aspell.AspellSpellerError as exception:
        logging.error('Disabled spelling (%s)', exception)
        options['spelling'] = False
    return result


def all_files():
    """
    Returns a list of all files contained in the git repository.
    """
    cmd = ['git', 'ls-files']
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    # communicate() drains both pipes and waits for git to finish, which
    # avoids blocking on a full stderr pipe and leaving a zombie process.
    output, _ = process.communicate()
    return output.decode('utf-8', 'replace').splitlines()


def open_editor(filename):
    """
    Open editor with file to edit.

    On Linux the command in the EDITOR environment variable is used when set,
    xdg-open otherwise. Nothing happens on unrecognized platforms.
    """
    if sys.platform.startswith('linux'):
        editor = os.getenv('EDITOR')
        if editor:
            print('{0} {1}'.format(editor, filename))
            sys.stdout.flush()
            # EDITOR may carry arguments (e.g. "emacs -nw"); split it and
            # pass the filename as a separate argument without a shell.
            subprocess.call(shlex.split(editor) + [filename])
        else:
            subprocess.call(['xdg-open', filename])
    elif sys.platform == "darwin":
        subprocess.call(['open', filename])
    elif sys.platform == "win32":
        os.system('"{0}"'.format(filename.replace('/', os.path.sep)))


def validate_files(filenames, options):
    """
    Checks file extensions and calls appropriate validator function.

    Returns True if all files validated succesfully.
    """
    result = True
    masters = []
    findings = []
    non_findings = []
    scans = []
    speller = initialize_speller()
    for filename in filenames:
        if not filename.lower().endswith(('.xml', 'xml"')):
            continue
        if SNIPPETDIR in filename or TEMPLATEDIR in filename:
            continue
        if (OFFERTE in filename and options['offer']) or \
           (REPORT in filename and not options['no_report']):
            masters.append(filename)
        type_result, xml_type = validate_xml(filename, options, speller)
        result = result and type_result
        # 'non-finding' contains 'finding', so it has to be tested first.
        if 'non-finding' in xml_type:
            non_findings.append(filename)
        elif 'finding' in xml_type:
            findings.append(filename)
        elif 'scans' in xml_type:
            scans.append(filename)
    for master in masters:
        result = validate_master(master, findings, non_findings, scans,
                                 options) and result
    return result


def validate_report():
    """
    Validates XML report file by trying to build it.

    Returns True if the report was built successful.
    """
    host, command = docbuilder_proxy.read_config(docbuilder_proxy.CONFIG_FILE)
    command = command + ' -c'
    return proxy_vagrant.execute_command(host, command)


def validate_xml(filename, options, speller):
    """
    Validates XML file by trying to parse it.

    Returns a tuple (result, xml_type): result is True if the file validated
    successfully, xml_type is the tag of the root element (empty when the
    file was skipped or could not be parsed).
    """
    result = True
    xml_type = ''
    # crude check whether the file is outside the pentext framework
    if 'notes' in filename:
        return result, xml_type
    logging.info('Validating XML file: %s', filename)
    try:
        with open(filename, 'rb') as xml_file:
            xml.sax.parse(xml_file, xml.sax.ContentHandler())
        tree = ElementTree.parse(filename,
                                 ElementTree.XMLParser(strip_cdata=False))
        type_result, xml_type = validate_type(tree, filename, options, speller)
        result = validate_long_lines(tree, filename, options) and result \
            and type_result
        if options['edit'] and not result:
            open_editor(filename)
    except (xml.sax.SAXException, ElementTree.ParseError) as exception:
        print('[-] validating {0} failed ({1})'.format(filename, exception))
        result = False
    except IOError as exception:
        print('[-] validating {0} failed ({1})'.format(filename, exception))
        result = False
    return result, xml_type


def get_all_text(node):
    """
    Retrieves all text within tags.

    The text of @node, of all its descendants and the tail of @node are
    concatenated; surrounding whitespace is stripped.
    """
    text_string = node.text or ''
    for element in node:
        text_string += get_all_text(element)
    if node.tail:
        text_string += node.tail
    return text_string.strip()


def is_capitalized(line):
    """
    Checks whether all words in @line start with a capital.

    Returns True if that's the case, or if @line is empty.
    """
    # capitalize() collapses runs of whitespace, so do the same here;
    # otherwise inner double spaces or newlines are reported as missing
    # capitalization.
    return not line or ' '.join(line.split()) == capitalize(line)


def capitalize(line):
    """
    Returns a capitalized version of @line, where the first word and all other
    words not in NOT_CAPITALIZED are capitalized.
    """
    words = []
    for word in line.strip().split():
        if not words or word not in NOT_CAPITALIZED:
            word = word[0].upper() + word[1:]
        words.append(word)
    return ' '.join(words)


def _append_final_dot(node):
    """
    Appends a dot to the last non-blank piece of text inside @node.

    Returns True if a dot was added, False if @node holds no text.
    """
    # Walk backwards through the document order: the tail of the last child
    # comes after that child's own content, which comes after earlier nodes.
    for child in reversed(node):
        if child.tail and child.tail.strip():
            child.tail = child.tail.rstrip() + '.'
            return True
        if _append_final_dot(child):
            return True
    if node.text and node.text.strip():
        node.text = node.text.rstrip() + '.'
        return True
    return False


def validate_type(tree, filename, options, speller):
    """
    Performs specific checks based on type.

    Currently only finding and non-finding are supported.

    Returns a tuple (result, xml_type): result is False when a check failed
    or when a fixable issue was found, xml_type is the tag of the root
    element.
    """
    result = True
    fix = False
    root = tree.getroot()
    xml_type = root.tag
    attributes = []
    tags = []
    if options['spelling']:
        result = validate_spelling(tree, filename, options, speller)
    if xml_type == 'pentest_report':
        attributes = ['findingCode']
    if xml_type == 'finding':
        attributes = ['threatLevel', 'type', 'id']
        tags = ['title', 'description', 'technicaldescription', 'impact',
                'recommendation']
    if xml_type == 'non-finding':
        attributes = ['id']
        tags = ['title']
    if not attributes:
        return result, xml_type
    for attribute in attributes:
        if attribute not in root.attrib:
            print('[A] Missing obligatory attribute in {0}: {1}'.
                  format(filename, attribute))
            if attribute == 'id':
                root.set(attribute, filename)
                fix = True
            else:
                result = False
            continue
        value = root.attrib[attribute]
        if attribute == 'threatLevel' and value not in \
           ('Low', 'Moderate', 'Elevated', 'High', 'Extreme'):
            print('[-] threatLevel is not Low, Moderate, High, Elevated or Extreme: {0} {1}'.
                  format(filename, value))
            result = False
        if attribute == 'type' and options['capitalization'] and \
           not is_capitalized(value):
            print('[A] Type missing capitalization (expected {0}, read {1})'.
                  format(capitalize(value), value))
            root.attrib[attribute] = capitalize(value)
            fix = True
    for tag in tags:
        element = root.find(tag)
        if element is None:
            logging.warning('Missing tag in %s: %s', filename, tag)
            result = False
            continue
        if not get_all_text(element):
            logging.warning('Empty tag in %s: %s', filename, tag)
            result = False
            continue
        if tag == 'title' and options['capitalization'] and \
           not is_capitalized(element.text):
            print('[A] Title missing capitalization in {0} (expected {1}, read {2})'.
                  format(filename, capitalize(element.text), element.text))
            element.text = capitalize(element.text)
            fix = True
        all_text = get_all_text(element)
        if tag == 'description' and all_text.strip()[-1] != '.':
            print('[A] Description missing final dot in {0}: {1}'.format(filename,
                                                                         all_text))
            # Add the dot to the last text node; assigning all text to
            # element.text would duplicate the content of child elements.
            if not _append_final_dot(element):
                element.text = all_text.strip() + '.'
            fix = True
    if fix:
        if options['auto_fix']:
            print('[+] Automatically fixed {0}'.format(filename))
            tree.write(filename)
        else:
            print('[+] NOTE: Items with [A] can be fixed automatically, use --auto-fix')
    return (result and not fix), xml_type


def _wrap_long_lines(text, filename, tag):
    """
    Breaks all lines of @text that are longer than MAX_LINE characters.

    Returns a tuple (wrapped_text, too_long, fixable): too_long is True if at
    least one line exceeded MAX_LINE, fixable is True if at least one break
    could be placed on a separation character.
    """
    wrapped = []
    too_long = False
    fixable = False
    for line in text.splitlines():
        while len(line) > MAX_LINE:
            too_long = True
            print('[-] {0} Line inside {1} too long: {2}'.
                  format(filename, tag, line[MAX_LINE:]))
            cutpoint = MAX_LINE
            for split in [' ', '"', '\'', '=', '-', ';']:
                # Search and cut on the same (text) indices; the last
                # separator in the list that matches wins.
                position = line.find(split, WARN_LINE, MAX_LINE)
                if position != -1:
                    cutpoint = position
                    fixable = True
            fixed_line = line[:cutpoint] + '\n'
            print('cutted line {0}'.format(line))
            line = line[cutpoint:]
            wrapped.append(fixed_line)
            print('[A] can be fixed (breaking at {0}): {1}'.format(cutpoint,
                                                                   fixed_line))
        wrapped.append(line + '\n')
    return ''.join(wrapped), too_long, fixable


def validate_long_lines(tree, filename, options):
    """
    Checks whether pre or code section contains lines longer than MAX_LINE
    characters.

    With options['auto_fix'], sections in which a break on a separation
    character was found are rewrapped and the file is written back.

    Returns True if the file validated successfully.
    """
    if not options['long']:
        return True
    result = True
    modified = False
    root = tree.getroot()
    for tag in ('pre', 'code'):
        for pre_section in root.iter(tag):
            if not pre_section.text:
                continue
            fixed_text, too_long, fixable = _wrap_long_lines(pre_section.text,
                                                             filename, tag)
            if too_long:
                result = False
            if fixable and options['auto_fix']:
                pre_section.text = fixed_text
                print(fixed_text)
                modified = True
    if modified:
        # Write once, after all sections have been processed.
        print('[+] Automatically fixed {0}'.format(filename))
        tree.write(filename)
        close_file(filename)
    return result


def validate_master(filename, findings, non_findings, scans, options):
    """
    Validates master file.

    Checks for leftover TODO keywords and verifies that every finding and
    non-finding is referenced from the master file. With options['auto_fix'],
    missing references are added as includes. @scans is currently not
    checked.

    Returns True if all checks were successful.
    """
    result = True
    include_findings = []
    include_nonfindings = []
    logging.info('Validating master file %s', filename)
    try:
        xmltree = ElementTree.parse(filename,
                                    ElementTree.XMLParser(strip_cdata=False))
        if not find_keyword(xmltree, 'TODO', filename):
            print('[-] Keyword checks failed for {0}'.format(filename))
            result = False
        logging.info('Performing cross check on findings, non-findings and scans...')
        for finding in findings:
            if not cross_check_file(filename, finding):
                print('[A] Cross check failed for finding {0}'.
                      format(finding))
                include_findings.append(finding)
                result = False
        for non_finding in non_findings:
            if not cross_check_file(filename, non_finding):
                logging.warning('Cross check failed for non-finding %s',
                                non_finding)
                include_nonfindings.append(non_finding)
                result = False
        if result:
            logging.info('Cross checks successful')
    except (ElementTree.ParseError, IOError) as exception:
        logging.warning('Validating %s failed: %s', filename, exception)
        result = False
    # Only missing includes can be fixed automatically; a parse error or a
    # leftover keyword cannot, and re-parsing a broken file would raise.
    if include_findings or include_nonfindings:
        if options['auto_fix']:
            add_include(filename, 'findings', include_findings)
            add_include(filename, 'nonFindings', include_nonfindings)
            close_file(filename)
            logging.info('Automatically fixed %s', filename)
        else:
            logging.warning('Item can be fixed automatically, use --auto-fix')
    return result


def report_string(report_file):
    """
    Return the report_file into a big memory mapped string.

    Exits the program when the file cannot be opened or mapped.
    """
    try:
        # The mapping stays valid after the file object is closed.
        with open(report_file, 'rb') as report:
            return mmap.mmap(report.fileno(), 0, access=mmap.ACCESS_READ)
    except (IOError, ValueError) as exception:
        # ValueError is raised for an empty file, which cannot be mapped.
        logging.critical('Could not open %s: %s', report_file, exception)
        sys.exit(-1)


def cross_check_file(filename, external):
    """
    Checks whether filename contains a cross-check to the file external.

    Returns True if the name @external occurs anywhere in @filename.
    """
    # The memory map holds bytes, so search for bytes.
    needle = external if isinstance(external, bytes) else external.encode('utf-8')
    report_text = report_string(filename)
    try:
        result = report_text.find(needle) != -1
    finally:
        report_text.close()
    if not result:
        logging.warning('Could not find a reference in %s to %s',
                        filename, external)
    return result


def add_include(filename, identifier, findings):
    """
    Adds XML include based on the identifier ('findings' or 'nonFindings').

    For every entry in @findings a placeholder include element is appended to
    the section of @filename whose id equals @identifier; close_file() turns
    the placeholders into proper XML includes afterwards.
    """
    if not findings:
        return
    tree = ElementTree.parse(filename, ElementTree.XMLParser(strip_cdata=False))
    finding_section = None
    for section in tree.iter('section'):
        # Sections without an id attribute simply do not match.
        if section.get('id') == identifier:
            finding_section = section
    if finding_section is None:
        logging.warning('Could not find section %s in %s', identifier, filename)
        return
    for finding in findings:
        # NOTE(review): the element markup was missing from the source (an
        # empty string was parsed, which always fails). The tag name follows
        # from close_file(); the '../' prefix assumes the master file lives
        # one directory below the repository root - TODO confirm.
        new_finding = ElementTree.Element('placeholderinclude')
        new_finding.set('href', '../{0}'.format(finding))
        finding_section.append(new_finding)
    tree.write(filename, encoding="utf-8", xml_declaration=True,
               pretty_print=True)


def close_file(filename):
    """
    Replace placeholder with proper XML include.

    The file is rewritten in place and re-serialized as pretty printed UTF-8
    with an XML declaration.
    """
    with io.open(filename, 'r', encoding='utf-8', newline='') as xml_file:
        filedata = xml_file.read()
    newdata = filedata.replace("placeholderinclude", "xi:include")
    with io.open(filename, 'w', encoding='utf-8', newline='') as xml_file:
        xml_file.write(newdata)
    tree = ElementTree.parse(filename, ElementTree.XMLParser(strip_cdata=False))
    tree.write(filename, encoding="utf-8", xml_declaration=True,
               pretty_print=True)


def find_keyword(xmltree, keyword, filename):
    """
    Finds keywords in an XML tree.
    This function needs lots of TLC.

    Returns True if @keyword does not occur in the text of any tag.
    """
    result = True
    section = ''
    for tag in xmltree.iter():
        # Remember the enclosing section to make the warning easier to find.
        if tag.tag == 'section' and 'id' in tag.attrib:
            section = 'in {0}'.format(tag.attrib['id'])
        if tag.text and keyword in tag.text:
            logging.warning('%s found in %s %s', keyword, filename, section)
            result = False
    return result


def setup_logging(options):
    """
    Set up loghandlers according to options.

    --debug shows everything, --verbose shows INFO and up, --quiet shows
    warnings and errors only; the default shows STATUS and up.
    """
    logger = logging.getLogger()
    logger.setLevel(0)
    console = logging.StreamHandler(stream=sys.stdout)
    console.setFormatter(LogFormatter())
    if options['debug']:
        console.setLevel(logging.DEBUG)
    elif options['verbose']:
        console.setLevel(logging.INFO)
    elif options.get('quiet'):
        logger.setLevel(logging.WARNING)
    else:
        logger.setLevel(STATUS)
    logger.addHandler(console)


def main():
    """
    The main program. Cross-checks, validates XML files and report.

    Returns True if the checks were successful.
    """
    if sys.version_info[0] < 3:
        # we want to print pretty Unicode characters
        reload(sys)
        sys.setdefaultencoding('utf-8')
    options = parse_arguments()
    setup_logging(options)
    if options['all']:
        options['capitalization'] = True
        options['long'] = True
    if options['learn']:
        logging.debug('Adding unknown words to %s', VOCABULARY)
    logging.info('Validating all XML files...')
    result = validate_files(all_files(), options)
    if result:
        if DOCBUILDER:
            logging.info('Validating report build...')
            result = validate_report() and result
    if result:
        logging.log(STATUS, 'Validation checks successful. Good to go')
    else:
        logging.warning('Validation failed')
    if options['spelling'] and options['learn']:
        logging.log(STATUS, 'Don\'t forget to check the vocabulary file %s',
                    VOCABULARY)
    return result


if __name__ == "__main__":
    main()