From ee7df7ba8c5e6a4b32b0c4048d2b535d8df3cbe9 Mon Sep 17 00:00:00 2001
From: Alexander Scheel <alexander.m.scheel@gmail.com>
Date: Sat, 7 Dec 2019 14:49:04 -0500
Subject: [PATCH] Markdown: Sanitizer Configuration (#9075)

* Support custom sanitization policy

Allowing the gitea administrator to configure sanitization policy allows
them to couple external renders and custom templates to support more
markup. In particular, the `pandoc` renderer allows generating KaTeX
annotations, wrapping them in `<span>` elements with class `math` and
either `inline` or `display` (depending on whether inline or block mode
was requested).

This iteration gives the administrator whitelisting powers; carefully
crafted regexes will thus let through only the desired attributes
necessary to support their custom markup.

Resolves: #9054

Signed-off-by: Alexander Scheel <alexander.m.scheel@gmail.com>

* Document new sanitization configuration

 - Adds basic documentation to app.ini.sample,
 - Adds an example to the Configuration Cheat Sheet, and
 - Adds extended information to External Renderers section.

Signed-off-by: Alexander Scheel <alexander.m.scheel@gmail.com>

* Drop extraneous length check in newMarkupSanitizer(...)

Signed-off-by: Alexander Scheel <alexander.m.scheel@gmail.com>

* Fix plural ELEMENT and ALLOW_ATTR in docs

These were left over from their initial names. Make them singular to
conform with the current expectations.

Signed-off-by: Alexander Scheel <alexander.m.scheel@gmail.com>
---
 custom/conf/app.ini.sample                    |   6 +
 .../doc/advanced/config-cheat-sheet.en-us.md  |  18 +++
 .../doc/advanced/external-renderers.en-us.md  |  18 +++
 modules/markup/sanitizer.go                   |   9 ++
 modules/setting/markup.go                     | 133 ++++++++++++++----
 5 files changed, 155 insertions(+), 29 deletions(-)

diff --git a/custom/conf/app.ini.sample b/custom/conf/app.ini.sample
index 8d11cfc293..050a0db730 100644
--- a/custom/conf/app.ini.sample
+++ b/custom/conf/app.ini.sample
@@ -877,6 +877,12 @@ SHOW_FOOTER_VERSION = true
 ; Show template execution time in the footer
 SHOW_FOOTER_TEMPLATE_LOAD_TIME = true
 
+[markup.sanitizer]
+; The following keys can be used multiple times to define sanitization policy rules.
+;ELEMENT = span
+;ALLOW_ATTR = class
+;REGEXP = ^(info|warning|error)$
+
 [markup.asciidoc]
 ENABLED = false
 ; List of file extensions that should be rendered by an external command
diff --git a/docs/content/doc/advanced/config-cheat-sheet.en-us.md b/docs/content/doc/advanced/config-cheat-sheet.en-us.md
index 9f02e888cf..0d7a641b19 100644
--- a/docs/content/doc/advanced/config-cheat-sheet.en-us.md
+++ b/docs/content/doc/advanced/config-cheat-sheet.en-us.md
@@ -578,6 +578,24 @@ Two special environment variables are passed to the render command:
 - `GITEA_PREFIX_SRC`, which contains the current URL prefix in the `src` path tree. To be used as prefix for links.
 - `GITEA_PREFIX_RAW`, which contains the current URL prefix in the `raw` path tree. To be used as prefix for image paths.
 
+
+Gitea supports customizing the sanitization policy for rendered HTML. The example below will support KaTeX output from pandoc.
+
+```ini
+[markup.sanitizer]
+; Pandoc renders TeX segments as <span>s with the "math" class, optionally
+; with "inline" or "display" classes depending on context.
+ELEMENT = span
+ALLOW_ATTR = class
+REGEXP = ^\s*((math(\s+|$)|inline(\s+|$)|display(\s+|$)))+
+```
+
+ - `ELEMENT`: The element this policy applies to. Must be non-empty.
+ - `ALLOW_ATTR`: The attribute this policy allows. Must be non-empty.
+ - `REGEXP`: A regex to match the contents of the attribute against. Must be present but may be empty for unconditional whitelisting of this attribute.
+
+You may define `ELEMENT`, `ALLOW_ATTR`, and `REGEXP` multiple times; the Nth occurrences of the three keys together form a single policy entry, so each key must appear the same number of times.
+
 ## Time (`time`)
 
 - `FORMAT`: Time format to diplay on UI. i.e. RFC1123 or 2006-01-02 15:04:05
diff --git a/docs/content/doc/advanced/external-renderers.en-us.md b/docs/content/doc/advanced/external-renderers.en-us.md
index a14f344e63..ec1ee63fb6 100644
--- a/docs/content/doc/advanced/external-renderers.en-us.md
+++ b/docs/content/doc/advanced/external-renderers.en-us.md
@@ -68,4 +68,22 @@ RENDER_COMMAND = rst2html.py
 IS_INPUT_FILE = false
 ```
 
+If your external markup relies on additional classes and attributes on the generated HTML elements, you might need to enable custom sanitizer policies. Gitea uses the [`bluemonday`](https://godoc.org/github.com/microcosm-cc/bluemonday) package as its HTML sanitizer. The example below will support [KaTeX](https://katex.org/) output from [`pandoc`](https://pandoc.org/).
+
+```ini
+[markup.sanitizer]
+; Pandoc renders TeX segments as <span>s with the "math" class, optionally
+; with "inline" or "display" classes depending on context.
+ELEMENT = span
+ALLOW_ATTR = class
+REGEXP = ^\s*((math(\s+|$)|inline(\s+|$)|display(\s+|$)))+
+
+[markup.markdown]
+ENABLED         = true
+FILE_EXTENSIONS = .md,.markdown
+RENDER_COMMAND  = pandoc -f markdown -t html --katex
+```
+
+You may define `ELEMENT`, `ALLOW_ATTR`, and `REGEXP` multiple times; the Nth occurrences of the three keys together form a single policy entry, so each key must appear the same number of times. All three must be defined, but `REGEXP` may be blank to allow unconditional whitelisting of that attribute.
+
 Once your configuration changes have been made, restart Gitea to have changes take effect.
diff --git a/modules/markup/sanitizer.go b/modules/markup/sanitizer.go
index 0ebb3ff88b..f7789a9e56 100644
--- a/modules/markup/sanitizer.go
+++ b/modules/markup/sanitizer.go
@@ -50,6 +50,15 @@ func ReplaceSanitizer() {
 
 	// Allow <kbd> tags for keyboard shortcut styling
 	sanitizer.policy.AllowElements("kbd")
+
+	// Custom keyword markup
+	for _, rule := range setting.ExternalSanitizerRules {
+		if rule.Regexp != nil {
+			sanitizer.policy.AllowAttrs(rule.AllowAttr).Matching(rule.Regexp).OnElements(rule.Element)
+		} else {
+			sanitizer.policy.AllowAttrs(rule.AllowAttr).OnElements(rule.Element)
+		}
+	}
 }
 
 // Sanitize takes a string that contains a HTML fragment or document and applies policy whitelist.
diff --git a/modules/setting/markup.go b/modules/setting/markup.go
index 41f3cdd3a1..75e6d651bd 100644
--- a/modules/setting/markup.go
+++ b/modules/setting/markup.go
@@ -9,11 +9,14 @@ import (
 	"strings"
 
 	"code.gitea.io/gitea/modules/log"
+
+	"gopkg.in/ini.v1"
 )
 
 // ExternalMarkupParsers represents the external markup parsers
 var (
-	ExternalMarkupParsers []MarkupParser
+	ExternalMarkupParsers  []MarkupParser
+	ExternalSanitizerRules []MarkupSanitizerRule
 )
 
 // MarkupParser defines the external parser configured in ini
@@ -25,8 +28,15 @@ type MarkupParser struct {
 	IsInputFile    bool
 }
 
+// MarkupSanitizerRule defines the policy for whitelisting attributes on
+// certain elements.
+type MarkupSanitizerRule struct {
+	Element   string
+	AllowAttr string
+	Regexp    *regexp.Regexp
+}
+
 func newMarkup() {
-	extensionReg := regexp.MustCompile(`\.\w`)
 	for _, sec := range Cfg.Section("markup").ChildSections() {
 		name := strings.TrimPrefix(sec.Name(), "markup.")
 		if name == "" {
@@ -34,33 +44,98 @@ func newMarkup() {
 			continue
 		}
 
-		extensions := sec.Key("FILE_EXTENSIONS").Strings(",")
-		var exts = make([]string, 0, len(extensions))
-		for _, extension := range extensions {
-			if !extensionReg.MatchString(extension) {
-				log.Warn(sec.Name() + " file extension " + extension + " is invalid. Extension ignored")
-			} else {
-				exts = append(exts, extension)
-			}
+		if name == "sanitizer" {
+			newMarkupSanitizer(name, sec)
+		} else {
+			newMarkupRenderer(name, sec)
 		}
-
-		if len(exts) == 0 {
-			log.Warn(sec.Name() + " file extension is empty, markup " + name + " ignored")
-			continue
-		}
-
-		command := sec.Key("RENDER_COMMAND").MustString("")
-		if command == "" {
-			log.Warn(" RENDER_COMMAND is empty, markup " + name + " ignored")
-			continue
-		}
-
-		ExternalMarkupParsers = append(ExternalMarkupParsers, MarkupParser{
-			Enabled:        sec.Key("ENABLED").MustBool(false),
-			MarkupName:     name,
-			FileExtensions: exts,
-			Command:        command,
-			IsInputFile:    sec.Key("IS_INPUT_FILE").MustBool(false),
-		})
 	}
 }
+
+func newMarkupSanitizer(name string, sec *ini.Section) {
+	haveElement := sec.HasKey("ELEMENT")
+	haveAttr := sec.HasKey("ALLOW_ATTR")
+	haveRegexp := sec.HasKey("REGEXP")
+
+	if !haveElement && !haveAttr && !haveRegexp {
+		log.Warn("Skipping empty section: markup.%s.", name)
+		return
+	}
+
+	if !haveElement || !haveAttr || !haveRegexp {
+		log.Error("Missing required keys from markup.%s. Must have all three of ELEMENT, ALLOW_ATTR, and REGEXP defined!", name)
+		return
+	}
+
+	elements := sec.Key("ELEMENT").ValueWithShadows()
+	allowAttrs := sec.Key("ALLOW_ATTR").ValueWithShadows()
+	regexps := sec.Key("REGEXP").ValueWithShadows()
+
+	if len(elements) != len(allowAttrs) ||
+		len(elements) != len(regexps) {
+		log.Error("All three keys in markup.%s (ELEMENT, ALLOW_ATTR, REGEXP) must be defined the same number of times! Got %d, %d, and %d respectively.", name, len(elements), len(allowAttrs), len(regexps))
+		return
+	}
+
+	ExternalSanitizerRules = make([]MarkupSanitizerRule, 0, len(elements))
+
+	for index, pattern := range regexps {
+		if pattern == "" {
+			rule := MarkupSanitizerRule{
+				Element:   elements[index],
+				AllowAttr: allowAttrs[index],
+				Regexp:    nil,
+			}
+			ExternalSanitizerRules = append(ExternalSanitizerRules, rule)
+			continue
+		}
+
+		// Validate when parsing the config that this is a valid regular
+		// expression. Then we can use regexp.MustCompile(...) later.
+		compiled, err := regexp.Compile(pattern)
+		if err != nil {
+			log.Error("In module.%s: REGEXP at definition %d failed to compile: %v", name, index+1, err)
+			continue
+		}
+
+		rule := MarkupSanitizerRule{
+			Element:   elements[index],
+			AllowAttr: allowAttrs[index],
+			Regexp:    compiled,
+		}
+		ExternalSanitizerRules = append(ExternalSanitizerRules, rule)
+	}
+}
+
+func newMarkupRenderer(name string, sec *ini.Section) {
+	extensionReg := regexp.MustCompile(`\.\w`)
+
+	extensions := sec.Key("FILE_EXTENSIONS").Strings(",")
+	var exts = make([]string, 0, len(extensions))
+	for _, extension := range extensions {
+		if !extensionReg.MatchString(extension) {
+			log.Warn(sec.Name() + " file extension " + extension + " is invalid. Extension ignored")
+		} else {
+			exts = append(exts, extension)
+		}
+	}
+
+	if len(exts) == 0 {
+		log.Warn(sec.Name() + " file extension is empty, markup " + name + " ignored")
+		return
+	}
+
+	command := sec.Key("RENDER_COMMAND").MustString("")
+	if command == "" {
+		log.Warn(" RENDER_COMMAND is empty, markup " + name + " ignored")
+		return
+	}
+
+	ExternalMarkupParsers = append(ExternalMarkupParsers, MarkupParser{
+		Enabled:        sec.Key("ENABLED").MustBool(false),
+		MarkupName:     name,
+		FileExtensions: exts,
+		Command:        command,
+		IsInputFile:    sec.Key("IS_INPUT_FILE").MustBool(false),
+	})
+}