corteza/compose/service/values/sanitizer.go

package values

import (
	"fmt"
	"html"
	"regexp"
	"strconv"
	"strings"
	"time"

	"github.com/cortezaproject/corteza-server/pkg/expr"
	"github.com/cortezaproject/corteza-server/pkg/logger"
	"github.com/microcosm-cc/bluemonday"
	"go.uber.org/zap"

	"github.com/cortezaproject/corteza-server/compose/types"
)

// sanitizer is a stateless helper that normalizes record values
// according to the kind of the module field they belong to.
type sanitizer struct{}

// Sanitizer returns a fresh sanitizer instance.
//
// The type carries no state; the constructor exists only to follow
// the pattern used by the rest of the package.
func Sanitizer() *sanitizer {
	return new(sanitizer)
}

// Run normalizes the given record values against the fields of module m
// and returns a new, cleaned-up value set (input values are cloned, not
// modified):
//  - values are regrouped per module field (in field order), deleted values
//    are dropped and the remaining ones get consecutive place indexes
//    starting at 0 (no gaps)
//  - field sanitizer expressions are evaluated, evaluation failures are
//    logged and the failing expression is skipped
//  - values of every kind except "string" are whitespace-trimmed
//  - reference fields get their Ref parsed from the value; a zero Ref
//    clears the value
//  - bool, datetime, number and string values are reformatted by kind;
//    invalid input is nullified/falsified, no errors are raised
//
// Fields with a value expression and values that are not marked as updated
// are left untouched.
//
// Existing data (when updating record) is not yet loaded at this point
func (s sanitizer) Run(m *types.Module, vv types.RecordValueSet) (out types.RecordValueSet) {
	parser := expr.Parser()

	out = make([]*types.RecordValue, 0, len(vv))

	// Pass 1: collect live values field by field and renumber their places.
	//
	// Values arrive already ordered, so the position inside the filtered
	// set is the place number we assign.
	for _, field := range m.Fields {
		place := uint(0)

		for _, val := range vv.FilterByName(field.Name) {
			if val.IsDeleted() {
				continue
			}

			cp := val.Clone()
			cp.Place = place
			place++

			out = append(out, cp)
		}
	}

	log := logger.Default().With(zap.Uint64("module", m.ID))

	// Pass 2: sanitize the collected values in place.
	for _, val := range out {
		field := m.Fields.FindByName(val.Name)

		// Nothing to do for:
		//  - unknown fields (if not handled before, sanitizer does not care)
		//  - fields with a value expression (never sanitized)
		//  - deleted or unchanged values
		if field == nil || field.Expressions.ValueExpr != "" || val.IsDeleted() || !val.Updated {
			continue
		}

		fieldKind := strings.ToLower(field.Kind)

		// Each sanitizer expression sees the result of the previous one
		// through the "value" variable.
		for _, src := range field.Expressions.Sanitizers {
			result, err := parser.Evaluate(src, map[string]interface{}{"value": val.Value})
			if err != nil {
				log.Error(
					"failed to evaluate sanitizer expression",
					zap.String("field", field.Name),
					zap.String("expr", src),
					zap.Error(err),
				)
				continue
			}

			val.Value = sanitize(field, result)
		}

		// Strings keep their surrounding whitespace, everything else is trimmed
		if fieldKind != "string" {
			val.Value = strings.TrimSpace(val.Value)
		}

		if field.IsRef() {
			if refy.MatchString(val.Value) {
				val.Ref, _ = strconv.ParseUint(val.Value, 10, 64)
			}

			// A reference without a usable ID carries no value
			if val.Ref == 0 {
				val.Value = ""
			}
		}

		// Kind specific formatting.
		//
		// Other kinds (email, file, record, select, url, user) have no
		// dedicated sanitization yet; add cases here when they become
		// relevant.
		switch fieldKind {
		case "bool":
			val.Value = sBool(val.Value)
		case "datetime":
			val.Value = sDatetime(val.Value, field.Options.Bool("onlyDate"), field.Options.Bool("onlyTime"))
		case "number":
			val.Value = sNumber(val.Value, field.Options.Precision())
		case "string":
			val.Value = sString(val.Value)
		}
	}

	return
}

// RunXSS passes the value of every "string" kind field through sString
// to strip insecure HTML.
//
// Values are modified in place and the same set is returned. Values that
// belong to unknown fields or to fields of any other kind are left as is.
func (s sanitizer) RunXSS(m *types.Module, vv types.RecordValueSet) types.RecordValueSet {
	for _, val := range vv {
		field := m.Fields.FindByName(val.Name)

		// Unknown field (not handled before, sanitizer does not care)
		// or a kind that needs no HTML cleanup
		if field == nil || strings.ToLower(field.Kind) != "string" {
			continue
		}

		val.Value = sString(val.Value)
	}

	return vv
}

// sBool converts v to the string representation of a boolean.
//
// Only a true bool or a string that (lower-cased) matches the truthy
// pattern yields strBoolTrue; any other value or type yields strBoolFalse.
func sBool(v interface{}) string {
	if b, isBool := v.(bool); isBool && b {
		return strBoolTrue
	}

	if str, isString := v.(string); isString && truthy.MatchString(strings.ToLower(str)) {
		return strBoolTrue
	}

	return strBoolFalse
}

// sDatetime parses v (formatted with %v) using a list of accepted layouts
// and returns it converted to UTC and formatted in the internal layout.
//
// onlyTime selects the time-only layouts and output format and takes
// precedence over onlyDate; onlyDate only changes the output format to
// the date-only one. The first layout that parses wins; when none
// matches an empty string is returned.
func sDatetime(v interface{}, onlyDate, onlyTime bool) string {
	var (
		// layouts accepted on input, tried in order
		layouts []string

		// layout used for the returned value
		target string
	)

	switch {
	case onlyTime:
		target = datetimeIntenralFormatTime
		layouts = []string{
			datetimeIntenralFormatTime,
			"15:04",
			"15:04:05Z07:00",
			"15:04:05 MST",
			"15:04:05 -0700",
			"15:04 MST",
			"15:04Z07:00",
			"15:04 -0700",
			time.Kitchen,
		}

	default:
		// Date-only fields accept the same inputs as full date & time
		// fields, only the output is shortened
		target = datetimeInternalFormatFull
		if onlyDate {
			target = datetimeInternalFormatDate
		}

		layouts = []string{
			datetimeInternalFormatFull,
			"2006-01-02T15:04:05", // iso8601 without timezone
			time.RFC1123Z,
			time.RFC1123,
			time.RFC822Z,
			time.RFC822,
			time.RFC850,
			time.ANSIC,
			time.UnixDate,
			time.RubyDate,
			"2006-01-02 15:04:05.999999999 -0700 MST", // Time.String()
			"2006-01-02",
			"02 Jan 2006",
			"2006-01-02T15:04:05-0700", // RFC3339 without timezone hh:mm colon
			"2006-01-02 15:04:05 -07:00",
			"2006-01-02 15:04:05 -0700",
			"2006-01-02 15:04:05Z07:00", // RFC3339 without T
			"2006-01-02 15:04:05Z0700",  // RFC3339 without T or timezone hh:mm colon
			"2006-01-02 15:04:05",
			time.Kitchen,
			time.Stamp,
			time.StampMilli,
			time.StampMicro,
			time.StampNano,
			datetimeInternalFormatDate,
			"02 Jan 06",
			"Monday, 02-Jan-06",
			"Mon, 02 Jan 2006",
			"2006/_1/_2",
		}
	}

	raw := fmt.Sprintf("%v", v)

	for i := range layouts {
		if ts, err := time.Parse(layouts[i], raw); err == nil {
			return ts.UTC().Format(target)
		}
	}

	// Nothing matched, nullify
	return ""
}

// sNumber converts num to a plain decimal string with at most p digits
// after the decimal point.
//
// num is formatted with %v and parsed as a 64-bit float. Trailing zeros
// of the fraction (and a dangling decimal point) are removed.
//
// "0" is returned when the input:
//  - can not be parsed as a number
//  - is not a finite number (NaN or +/-Inf, both accepted by the parser)
//  - rounds to negative zero at the requested precision
func sNumber(num interface{}, p uint) string {
	base, err := strconv.ParseFloat(fmt.Sprintf("%v", num), 64)
	if err != nil {
		return "0"
	}

	// Format the value to the desired precision
	str := strconv.FormatFloat(base, 'f', int(p), 64)

	// ParseFloat happily accepts "NaN", "Inf" and "Infinity" (any case);
	// these are not valid numbers for our purposes, treat them the same
	// way as any other invalid input
	switch str {
	case "NaN", "+Inf", "-Inf":
		return "0"
	}

	// In case of fractions, remove trailing 0's
	if strings.Contains(str, ".") {
		str = strings.TrimRight(str, "0")
		str = strings.TrimRight(str, ".")
	}

	// Small negative values rounded to the given precision (and the
	// negative zero itself) would otherwise come out as "-0"
	if str == "-0" {
		return "0"
	}

	return str
}

// sStringHexColor matches CSS hex color values: #rgb, #rgba, #rrggbb
// and #rrggbbaa (case insensitive).
var sStringHexColor = regexp.MustCompile("(?i)^#([0-9a-f]{3,4}|[0-9a-f]{6}|[0-9a-f]{8})$")

// sStringPolicy is the HTML sanitization policy used by sString.
//
// It is built once and reused for every call instead of being
// reconstructed (together with its regular expressions) per value.
// It must not be modified after initialization.
var sStringPolicy = func() *bluemonday.Policy {
	// use standard html escaping policy
	p := bluemonday.UGCPolicy()

	// match only colors for html editor elements on style attr
	p.AllowAttrs("style").OnElements("span", "p")
	p.AllowStyles("color").Matching(sStringHexColor).Globally()
	p.AllowStyles("background-color").Matching(sStringHexColor).Globally()

	return p
}()

// sString is used mostly to strip insecure html data
// from strings
//
// The input is run through sStringPolicy and HTML entities in the
// result are unescaped before it is returned.
func sString(str string) string {
	sanitized := sStringPolicy.Sanitize(str)

	// handle escaped strings and unescape them so that plain text
	// (ampersands, quotes, ...) is stored unmodified
	//
	// NOTE(review): unescaping also applies to entities that were already
	// present in the input; a value such as "&lt;script&gt;" passes the
	// policy as text and is turned back into markup here. Consumers should
	// not treat the result as safe HTML without escaping -- confirm this
	// is the intended contract.
	return html.UnescapeString(sanitized)
}

// sanitize converts v to the string format expected by the kind of field f.
//
// Bool, datetime and number kinds are delegated to their dedicated
// helpers; values of any other kind are formatted with %v.
func sanitize(f *types.ModuleField, v interface{}) string {
	kind := strings.ToLower(f.Kind)

	if kind == "bool" {
		return sBool(v)
	}

	if kind == "datetime" {
		return sDatetime(v, f.Options.Bool("onlyDate"), f.Options.Bool("onlyTime"))
	}

	if kind == "number" {
		return sNumber(v, f.Options.Precision())
	}

	return fmt.Sprintf("%v", v)
}