From 641a5b7a1447954076728f259342c2f9201bb0b5 Mon Sep 17 00:00:00 2001 From: Bruce Momjian Date: Fri, 1 Nov 2024 12:46:51 -0400 Subject: [PATCH] doc: improve build for non-Latin1 characters Add README.non-ASCII to explain non-ASCII doc behavior; some text moved from release.sgml. Change UTF8 SGML characters to use HTML entities. Remove unnecessary UTF8 spaces. Add SVG file check for check-nbsp target. Add dummy 'pdf' Makefile target. Reported-by: Yugo Nagata Discussion: https://postgr.es/m/20241011114122.c90f8a871462da36f2e2afeb@sraoss.co.jp Backpatch-through: master --- doc/src/sgml/Makefile | 11 ++++--- doc/src/sgml/README.non-ASCII | 37 +++++++++++++++++++++++ doc/src/sgml/charset.sgml | 10 +++--- doc/src/sgml/images/genetic-algorithm.svg | 4 +-- doc/src/sgml/release.sgml | 18 ----------- doc/src/sgml/stylesheet-man.xsl | 12 ++++---- 6 files changed, 56 insertions(+), 36 deletions(-) create mode 100644 doc/src/sgml/README.non-ASCII diff --git a/doc/src/sgml/Makefile b/doc/src/sgml/Makefile index 65ed32cd0a..12f506c960 100644 --- a/doc/src/sgml/Makefile +++ b/doc/src/sgml/Makefile @@ -59,7 +59,7 @@ GENERATED_SGML = version.sgml \ features-supported.sgml features-unsupported.sgml errcodes-table.sgml \ keywords-table.sgml targets-meson.sgml wait_event_types.sgml -ALLSGML := $(wildcard $(srcdir)/*.sgml $(srcdir)/ref/*.sgml) $(GENERATED_SGML) +ALL_SGML := $(wildcard $(srcdir)/*.sgml $(srcdir)/ref/*.sgml) $(GENERATED_SGML) ALL_IMAGES := $(wildcard $(srcdir)/images/*.svg) @@ -68,7 +68,7 @@ ALL_IMAGES := $(wildcard $(srcdir)/images/*.svg) # we're at it, also resolve all entities (that is, copy all included # files into one big file). This helps tools that don't understand # vpath builds (such as dbtoepub). 
-postgres-full.xml: postgres.sgml $(ALLSGML) +postgres-full.xml: postgres.sgml $(ALL_SGML) $(XMLLINT) $(XMLINCLUDE) --output $@ --noent --valid $< @@ -143,11 +143,12 @@ postgres.txt: postgres.html ## Print ## -postgres.pdf: +postgres.pdf pdf: $(error Invalid target; use postgres-A4.pdf or postgres-US.pdf as targets) XSLTPROC_FO_FLAGS += --stringparam img.src.path '$(srcdir)/' +# XSL Formatting Objects (FO), https://en.wikipedia.org/wiki/XSL_Formatting_Objects %-A4.fo: stylesheet-fo.xsl %-full.xml $(XSLTPROC) $(XMLINCLUDE) $(XSLTPROCFLAGS) $(XSLTPROC_FO_FLAGS) --stringparam paper.type A4 -o $@ $^ @@ -194,7 +195,7 @@ MAKEINFO = makeinfo ## # Quick syntax check without style processing -check: postgres.sgml $(ALLSGML) check-tabs check-nbsp +check: postgres.sgml $(ALL_SGML) check-tabs check-nbsp $(XMLLINT) $(XMLINCLUDE) --noout --valid $< @@ -264,7 +265,7 @@ check-tabs: # Use perl command because non-GNU grep or sed could not have hex escape sequence. check-nbsp: @ ( $(PERL) -ne '/\xC2\xA0/ and print("$$ARGV:$$_"),$$n++; END {exit($$n>0)}' \ - $(wildcard $(srcdir)/*.sgml $(srcdir)/ref/*.sgml $(srcdir)/*.xsl) ) || \ + $(wildcard $(srcdir)/*.sgml $(srcdir)/ref/*.sgml $(srcdir)/images/*.svg $(srcdir)/*.xsl $(srcdir)/images/*.xsl) ) || \ (echo "Non-breaking spaces appear in SGML/XML files" 1>&2; exit 1) ## diff --git a/doc/src/sgml/README.non-ASCII b/doc/src/sgml/README.non-ASCII new file mode 100644 index 0000000000..9c21e02e8f --- /dev/null +++ b/doc/src/sgml/README.non-ASCII @@ -0,0 +1,37 @@ + + +Representation of non-ASCII characters +-------------------------------------- + +Find non-ASCII characters using: + + grep --recursive --color='auto' -P '[\x80-\xFF]' . 
+ +Convert to HTML4 named entity (&) escapes +----------------------------------------- + +We support several output formats: + +* html (supports all Unicode characters) +* man (supports all Unicode characters) +* pdf (supports only Latin-1 characters) +* info + +While some output formatting tools support all Unicode characters, +others only support Latin-1 characters. Specifically, the PDF rendering +engine can only display Latin-1 characters; non-Latin-1 Unicode +characters are displayed as "###". + +Therefore, in the SGML files, we only use Latin-1 characters. We +typically encode these characters as HTML entities, e.g., &Aacute;lvaro. +It is also possible to safely represent Latin-1 characters in UTF8 +encoding for all output formats. + +Do not use UTF numeric character escapes (&#nnn;). + +HTML entities + official: http://www.w3.org/TR/html4/sgml/entities.html + one page: http://www.zipcon.net/~swhite/docs/computers/browsers/entities_page.html + other lists: http://www.zipcon.net/~swhite/docs/computers/browsers/entities.html + http://www.zipcon.net/~swhite/docs/computers/browsers/entities_page.html + https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml index 1ef5322b91..f5e115e8d6 100644 --- a/doc/src/sgml/charset.sgml +++ b/doc/src/sgml/charset.sgml @@ -1225,7 +1225,7 @@ CREATE COLLATION ignore_accents (provider = icu, locale = 'und-u-ks-level1-kc-tr -- ignore differences in accents and case CREATE COLLATION ignore_accent_case (provider = icu, deterministic = false, locale = 'und-u-ks-level1'); -SELECT 'Å' = 'A' COLLATE ignore_accent_case; -- true +SELECT '&Aring;' = 'A' COLLATE ignore_accent_case; -- true SELECT 'z' = 'Z' COLLATE ignore_accent_case; -- true -- upper case letters sort before lower case. 
@@ -1282,7 +1282,7 @@ SELECT 'w;x*y-z' = 'wxyz' COLLATE num_ignore_punct; -- true 'ab' = U&'a\2063b' 'x-y' = 'x_y' 'g' = 'G' - 'n' = 'ñ' + 'n' = 'ñ' 'y' = 'z' @@ -1346,7 +1346,7 @@ SELECT 'w;x*y-z' = 'wxyz' COLLATE num_ignore_punct; -- true At every level, even with full normalization off, basic normalization is - performed. For example, 'á' may be composed of the + performed. For example, 'á' may be composed of the code points U&'\0061\0301' or the single code point U&'\00E1', and those sequences will be considered equal even at the identic level. To treat @@ -1430,8 +1430,8 @@ SELECT 'x-y' = 'x_y' COLLATE level4; -- false false Backwards comparison for the level 2 differences. For example, - locale und-u-kb sorts 'àe' - before 'aé'. + locale und-u-kb sorts 'àe' + before 'aé'. diff --git a/doc/src/sgml/images/genetic-algorithm.svg b/doc/src/sgml/images/genetic-algorithm.svg index fb9fdd1ba7..2ce5f1b271 100644 --- a/doc/src/sgml/images/genetic-algorithm.svg +++ b/doc/src/sgml/images/genetic-algorithm.svg @@ -72,7 +72,7 @@ a4->end -true   +true @@ -85,7 +85,7 @@ a4->a5 -false    +false diff --git a/doc/src/sgml/release.sgml b/doc/src/sgml/release.sgml index 8433690dea..cee577ff8d 100644 --- a/doc/src/sgml/release.sgml +++ b/doc/src/sgml/release.sgml @@ -16,24 +16,6 @@ pg_[A-Za-z0-9_]+ , \<[a-z]+_[a-z_]+\> , -non-ASCII characters find using grep -P '[\x80-\xFF]' or - (remove 'X') grep -X-color='auto' -P -n "[\x80-\xFF]" - convert to HTML4 named entity (&) escapes - - official: http://www.w3.org/TR/html4/sgml/entities.html - one page: http://www.zipcon.net/~swhite/docs/computers/browsers/entities_page.html - other lists: http://www.zipcon.net/~swhite/docs/computers/browsers/entities.html - http://www.zipcon.net/~swhite/docs/computers/browsers/entities_page.html - https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references - - We cannot use UTF8 because rendering engines have to - support the referenced characters. 
- - Do not use numeric _UTF_ numeric character escapes (&#nnn;), - we can only use Latin1. - - Example: Alvaro Herrera is &Aacute;lvaro Herrera - wrap long lines For new features, add links to the documentation sections. diff --git a/doc/src/sgml/stylesheet-man.xsl b/doc/src/sgml/stylesheet-man.xsl index fcb485c293..2e2564da68 100644 --- a/doc/src/sgml/stylesheet-man.xsl +++ b/doc/src/sgml/stylesheet-man.xsl @@ -213,12 +213,12 @@ - - - - - - + + + + + +