[Git][cmucl/cmucl][issue-120-software-type-in-c] 5 commits: Address #158: Filename encoding for Darwin

Raymond Toy (@rtoy) gitlab at common-lisp.net
Sat Mar 25 14:48:55 UTC 2023



Raymond Toy pushed to branch issue-120-software-type-in-c at cmucl / cmucl


Commits:
5dc0d7bf by Raymond Toy at 2023-03-24T14:31:40+00:00
Address #158: Filename encoding for Darwin

- - - - -
3578e015 by Raymond Toy at 2023-03-24T14:31:42+00:00
Merge branch 'issue-158-darwin-pathnames-utf8' into 'master'

Address #158: Filename encoding for Darwin

Closes #166 and #159

See merge request cmucl/cmucl!113
- - - - -
ce823be4 by Raymond Toy at 2023-03-24T08:17:30-07:00
Update release notes with recently closed issues

We left out a few issues that probably aren't relevant to users
like #175 and #170.  We also added #158 which isn't closed, but
explains we support utf-8 on Darwin.

- - - - -
b9b145ab by Raymond Toy at 2023-03-24T08:23:42-07:00
Add period at the end of each bug item

- - - - -
a06dad9b by Raymond Toy at 2023-03-25T07:48:32-07:00
Merge branch 'master' into issue-120-software-type-in-c

- - - - -


8 changed files:

- src/code/pathname.lisp
- src/code/save.lisp
- src/code/string.lisp
- src/code/unicode.lisp
- src/general-info/release-21e.md
- src/i18n/locale/cmucl.pot
- tests/issues.lisp
- + tests/resources/darwin/안녕하십니까.txt


Changes:

=====================================
src/code/pathname.lisp
=====================================
@@ -252,6 +252,14 @@
 ;;; This constructor is used to make an instance of the correct type
 ;;; from parsed arguments.
 
+#+darwin
+(defvar *enable-darwin-path-normalization* nil
+  "When non-NIL, pathnames are on Darwin are normalized when created.
+  Otherwise, the pathnames are unchanged.
+
+  This must be NIL during bootstrapping because Unicode is not yet
+  available.")
+
 (defun %make-pathname-object (host device directory name type version)
   (if (typep host 'logical-host)
       (flet ((upcasify (thing)
@@ -271,7 +279,30 @@
 				(upcasify name)
 				(upcasify type)
 				(upcasify version)))
-      (%make-pathname host device directory name type version)))
+      #-darwin
+      (%make-pathname host device directory name type version)
+      #+darwin
+      (flet ((normalize-name (piece)
+	       ;; Normalize Darwin pathnames by converting Hangul
+	       ;; syllables to conjoining jamo, and converting the
+	       ;; string to NFD form, but skipping over a range of
+	       ;; characters.
+	       (typecase piece
+		 (string
+		  (if *enable-darwin-path-normalization*
+		      (decompose (unicode::decompose-hangul piece)
+				 :compatibility nil
+				 :darwinp t)
+		      piece))
+		 (t
+		  ;; What should we do about lisp::pattern objects
+		  ;; that occur in the name component?
+		  piece))))
+	(%make-pathname host device
+			(mapcar #'normalize-name directory)
+			(normalize-name name)
+			(normalize-name type)
+			version))))
 
 ;;; *LOGICAL-HOSTS* --internal.
 ;;;


=====================================
src/code/save.lisp
=====================================
@@ -202,7 +202,7 @@
 				 (site-init "library:site-init")
 				 (print-herald t)
 				 (process-command-line t)
-		                  #+:executable
+		                 #+:executable
 		                 (executable nil)
 				 (batch-mode nil)
 				 (quiet nil))


=====================================
src/code/string.lisp
=====================================
@@ -1097,7 +1097,10 @@
 
 #+unicode
 (progn
-(defun decompose (string &optional (compatibility t))
+(defun decompose (string &key (compatibility t) (start 0) end darwinp)
+  "Convert STRING to NFD (or NFKD).  If :darwinp is non-NIL, then
+  characters in the ranges U2000-U2FFF, UF900-UFA6A, and U2F800-U2FA1D
+  are not decomposed, as specified for Darwin pathnames."
   (declare (type string string))
   (let ((result (make-string (cond ((< (length string) 40)
 				    (* 5 (length string)))
@@ -1113,8 +1116,13 @@
 		 (declare (type kernel:index i))
 		 (multiple-value-bind (code wide) (codepoint string i)
 		   (when wide (incf i))
-		   (let ((decomp (unicode-decomp code compatibility)))
-		     (if decomp (rec decomp 0 (length decomp)) (out code))))))
+		   (if (and darwinp
+			    (or (<= #x2000 code #x2fff)
+				(<= #xf900 code #xfa6a)
+				(<= #x2f800 code #x2fa1d)))
+		       (out code)
+		       (let ((decomp (unicode-decomp code compatibility)))
+			 (if decomp (rec decomp 0 (length decomp)) (out code)))))))
 	     (out (code)
 	       (multiple-value-bind (hi lo) (surrogates code)
 		 (outch hi)
@@ -1151,7 +1159,7 @@
 					  (schar result (1+ last)))))
 			    (decf last (if wide2 2 1)))
 			   (t (return))))))))
-      (with-string string
+      (with-one-string string start end offset-var
 	(rec string start end))
       (shrink-vector result fillptr))))
 
@@ -1251,12 +1259,12 @@
 (defun string-to-nfd (string)
   _N"Convert String to Unicode Normalization Form D (NFD) using the
   canonical decomposition.  The NFD string is returned"
-  (decompose string nil))
+  (decompose string :compatibility nil))
 
 (defun string-to-nfkd (string)
   _N"Convert String to Unicode Normalization Form KD (NFKD) uisng the
   compatible decomposition form.  The NFKD string is returned."
-  (decompose string t))
+  (decompose string :compatibility t))
 
 (defun string-to-nfc (string)
   _N"Convert String to Unicode Normalization Form C (NFC).  If the


=====================================
src/code/unicode.lisp
=====================================
@@ -517,3 +517,55 @@
       (if (eq casing :simple)
 	  (cl:string-capitalize string :start start :end end)
 	  (string-capitalize-full string :start start :end end :casing casing))))
+
+
+(defun decompose-hangul-syllable (cp stream)
+  "Decompose the Hangul syllable codepoint CP to an equivalent sequence
+  of conjoining jamo and print the decomposed result to the stream
+  STREAM."
+  (let* ((s-base #xac00)
+	 (l-base #x1100)
+	 (v-base #x1161)
+	 (t-base #x11a7)
+	 (v-count 21)
+	 (t-count 28)
+	 (n-count (* v-count t-count)))
+    ;; Step 1: Compute index of the syllable S
+    (let ((s-index (- cp s-base)))
+      ;; Step 2: If s is in the range 0 <= s <= s-count, the compute
+      ;; the components.
+      (let ((l (+ l-base (truncate s-index n-count)))
+	    (v (+ v-base (truncate (mod s-index n-count) t-count)))
+	    (tt (+ t-base (mod s-index t-count))))
+	;; Step 3: If tt = t-base, then there is no trailing character
+	;; so replace s by the sequence <l,v>.  Otherwise there is a
+	;; trailing character, so replace s by the sequence <l,v,tt>.
+	(princ (code-char l) stream)
+	(princ (code-char v) stream)
+	(unless (= tt t-base)
+	  (princ (code-char tt) stream)))))
+  (values))
+
+(defun is-hangul-syllable (codepoint)
+  "Test if CODEPOINT is a Hangul syllable"
+  (let* ((s-base #xac00)
+	 (l-count 19)
+	 (v-count 21)
+	 (t-count 28)
+	 (n-count (* v-count t-count))
+	 (number-of-syllables (* l-count n-count)))
+    (<= 0 (- codepoint s-base) number-of-syllables)))
+
+(defun decompose-hangul (string)
+  "Decompose any Hangul syllables in STRING to an equivalent sequence of
+  conjoining jamo characters."
+  (with-output-to-string (s)
+    (loop for cp being the codepoints of string
+	  do
+	     (if (is-hangul-syllable cp)
+		 (decompose-hangul-syllable cp s)
+		 (multiple-value-bind (high low)
+		     (surrogates cp)
+		   (princ high s)
+		   (when low
+		     (princ low s)))))))


=====================================
src/general-info/release-21e.md
=====================================
@@ -22,63 +22,68 @@ public domain.
   * Feature enhancements
   * Changes
     * Update to ASDF 3.3.6
-    * The default external format is `:utf-8` instead of `:iso8859-1`
+    * The default external format is `:utf-8` instead of `:iso8859-1`.
   * ANSI compliance fixes:
   * Bug fixes:
     * ~~#97~~ Fixes stepping through the source forms in the debugger.  This has been broken for quite some time, but it works now.
 
   * Gitlab tickets:
-    * ~~#68~~ gcc8.1.1 can't build lisp.  Change optimization from `-O2` to `-O1`
-    * ~~#72~~ CMU user manual now part of cmucl-site
-    * ~~#73~~ Update clx from upstream clx
-    * ~~#77~~ Added tests for sqrt for exceptional values
+    * ~~#68~~ gcc8.1.1 can't build lisp.  Change optimization from `-O2` to `-O1`.
+    * ~~#72~~ CMU user manual now part of cmucl-site.
+    * ~~#73~~ Update clx from upstream clx.
+    * ~~#77~~ Added tests for sqrt for exceptional values.
     * ~~#79~~ Autoload ASDF when calling `REQUIRE` the first time.  User's no longer have to explicitly load ASDF anymore.
     * ~~#80~~ Use ASDF to load contribs.  cmu-contribs still exists but does nothing.  The contrib names are the same, except it's best to use a keyword instead of a string.  So, `:contrib-demos` instead of `"contrib-demos"`.
-    * ~~#81~~ Added contribs from Eric Marsden
-    * ~~#82~~ Replace bc with expr in GNUMakefile
-    * ~~#86~~ Building with gcc 8 and later works when using -O2 optimization
+    * ~~#81~~ Added contribs from Eric Marsden.
+    * ~~#82~~ Replace bc with expr in GNUMakefile.
+    * ~~#86~~ Building with gcc 8 and later works when using -O2 optimization.
     * ~~#90~~ Some static symbols have been removed.  This probably makes the fasl files incompatible with older versions.
-    * ~~#91~~ Loop destructuring no longer incorrectly signals an error
-    * ~~#95~~ Disassembler syntax of x86 je and movzx is incorrect
+    * ~~#91~~ Loop destructuring no longer incorrectly signals an error.
+    * ~~#95~~ Disassembler syntax of x86 je and movzx is incorrect.
     * ~~#97~~ Define and use ud2 instruction instead of int3.  Fixes single-stepping.
-    * ~~#98~~ fstpd is not an Intel instruction; disassemble as `fstp dword ptr [addr]`
+    * ~~#98~~ fstpd is not an Intel instruction; disassemble as `fstp dword ptr [addr]`.
     * ~~#100~~ ldb prints out Unicode base-chars correctly instead of just the low 8 bits.
-    * ~~#103~~ RANDOM-MT19937-UPDATE assembly routine still exists
+    * ~~#103~~ RANDOM-MT19937-UPDATE assembly routine still exists.
     * ~~#104~~ Single-stepping broken (fixed via #97).
-    * ~~#107~~ Replace u_int8_t with uint8_t
-    * ~~#108~~ Update ASDF
-    * ~~#112~~ CLX can't connect to X server via inet sockets
+    * ~~#107~~ Replace u_int8_t with uint8_t.
+    * ~~#108~~ Update ASDF.
+    * ~~#112~~ CLX can't connect to X server via inet sockets.
     * ~~#113~~ REQUIRE on contribs can pull in the wrong things via ASDF..
-    * ~~#120~~ `SOFTWARE-TYPE` and `SOFTWARE-VERSION` are implemented in C.
+    * ~~#120~~ `SOFTWARE-VERSION` is implemented in C.
     * ~~#121~~ Wrong column index in FILL-POINTER-OUTPUT-STREAM
     * ~~#122~~ gcc 11 can't build cmucl
     * ~~#124~~ directory with `:wild-inferiors` doesn't descend subdirectories 
     * ~~#125~~ Linux `unix-stat` returning incorrect values
     * ~~#127~~ Linux unix-getpwuid segfaults when given non-existent uid..
-    * ~~#128~~ `QUIT` accepts an exit code
-    * ~~#130~~ Move file-author to C 
-    * ~~#132~~ Ansi test `RENAME-FILE.1` no longer fails
-    * ~~#134~~ Handle the case of `(expt complex complex-rational)`
-    * ~~#136~~ `ensure-directories-exist` should return the given pathspec
-    * #139 `*default-external-format*` defaults to `:utf-8`; add alias for `:locale` external format
-    * ~~#140~~ External format for streams that are not `file-stream`'s
-    * ~~#141~~ Disallow locales that are pathnames to a localedef file
-    * ~~#142~~ `(random 0)` signals incorrect error
-    * ~~#147~~ `stream-line-column` method missing for `fundamental-character-output-stream`
-    * ~~#149~~ Call setlocale(3C) on startup
-    * ~~#150~~ Add aliases for external format cp949 and euckr
+    * ~~#128~~ `QUIT` accepts an exit code.
+    * ~~#130~~ Move file-author to C.
+    * ~~#132~~ Ansi test `RENAME-FILE.1` no longer fails.
+    * ~~#134~~ Handle the case of `(expt complex complex-rational)`.
+    * ~~#136~~ `ensure-directories-exist` should return the given pathspec.
+    * #139 `*default-external-format*` defaults to `:utf-8`; add alias for `:locale` external format.
+    * ~~#140~~ External format for streams that are not `file-stream`'s.
+    * ~~#141~~ Disallow locales that are pathnames to a localedef file.
+    * ~~#142~~ `(random 0)` signals incorrect error.
+    * ~~#147~~ `stream-line-column` method missing for `fundamental-character-output-stream`.
+    * ~~#149~~ Call setlocale(3C) on startup.
+    * ~~#150~~ Add aliases for external format cp949 and euckr.
     * ~~#151~~ Change `*default-external-format*` to `:utf-8`.
-    * ~~#155~~ Wrap help strings neatly
-    * ~~#157~~ `(directory "foo/**/")` only returns directories now
-    * ~~#163~~ Add command-line option `-version` and `--version` to get lisp version
-    * ~~#165~~ Avoid inserting NIL into simple `LOOP` from `FORMAT`
-    * ~~#166~~ Fix incorrect type declaration for exponent from `integer-decode-float`
+    * ~~#152~~ Add new external format, `:locale` as an alias to the codeset from LANG and friends.
+    * ~~#!53~~ Terminals default to an encoding of `:locale`.
+    * ~~#155~~ Wrap help strings neatly.
+    * ~~#157~~ `(directory "foo/**/")` only returns directories now.
+    * #158 Darwin uses utf-8, but we don't support all the rules for pathnames.
+    * ~~#162~~ `*filename-encoding*` defaults to `:null` to mean no encoding.
+    * ~~#163~~ Add command-line option `-version` and `--version` to get lisp version.
+    * ~~#165~~ Avoid inserting NIL into simple `LOOP` from `FORMAT`.
+    * ~~#166~~ Fix incorrect type declaration for exponent from `integer-decode-float`.
     * ~~#167~~ Low bound for `decode-float-exponent` type was off by one..
-    * ~~#168~~ Don't use negated forms for jmp instructions when possible
-    * ~~#169~~ Add pprinter for `define-vop` and `sc-case`
+    * ~~#168~~ Don't use negated forms for jmp instructions when possible.
+    * ~~#169~~ Add pprinter for `define-vop` and `sc-case`.
     * ~~#172~~ Declare `pathname-match-p` as returning `nil` or `pathname`.
-    * ~~#173~~ Add pprinter for `define-assembly-routine`
+    * ~~#173~~ Add pprinter for `define-assembly-routine`.
     * ~~#176~~ `SHORT-SITE-NAME` and `LONG-SITE-NAME` return `NIL`.
+    * ~~#177~~ Add pprinter for `deftransform` and `defoptimizer`.
   * Other changes:
   * Improvements to the PCL implementation of CLOS:
   * Changes to building procedure:


=====================================
src/i18n/locale/cmucl.pot
=====================================
@@ -4012,6 +4012,14 @@ msgid ""
 "  string is returned."
 msgstr ""
 
+#: src/code/string.lisp
+msgid ""
+"Convert String to NFD (or NFKD).  If :darwinp is non-NIL, then\n"
+"    characters in the ranges U2000-U2FFF, UF900-UFA6A, and\n"
+"    U2F800-U2FA1D are not decomposed, as specified for Darwin\n"
+"    pathnames."
+msgstr ""
+
 #: src/code/string.lisp
 msgid ""
 "Convert a sequence of codepoints to a string.  Codepoints outside\n"
@@ -15267,6 +15275,23 @@ msgid ""
 "  delimited by non-case-modifiable chars.  "
 msgstr ""
 
+#: src/code/unicode.lisp
+msgid ""
+"Decompose the Hangul syllable codepoint CP to an equivalent sequence\n"
+"  of conjoining jamo and print the decomposed result to the stream\n"
+"  STREAM."
+msgstr ""
+
+#: src/code/unicode.lisp
+msgid "Test if CODEPOINT is a Hangul syllable"
+msgstr ""
+
+#: src/code/unicode.lisp
+msgid ""
+"Decompose any Hangul syllables in STRING to an equivalent sequence of\n"
+"  conjoining jamo characters."
+msgstr ""
+
 #: src/compiler/macros.lisp
 msgid ""
 "Policy Node Condition*\n"


=====================================
tests/issues.lisp
=====================================
@@ -832,6 +832,54 @@
 
 
 
+(define-test issue.158
+    (:tag :issues)
+  (let* ((name (string #\Hangul_Syllable_Gyek))
+	 (path (make-pathname :directory (list :relative name)
+			      :name name
+			      :type name)))
+    ;; Enable this when we implement normalization for Darwin
+    #+(and nil darwin)
+    (let ((expected '(4352 4456 4543)))
+      ;; Tests that on Darwin the Hangul pathname has been normalized
+      ;; correctly.  We fill in the directory, name, and type components
+      ;; with the same thing since it shouldn't really matter.
+      ;;
+      ;; The expected value is the conjoining jamo for the character
+      ;; #\Hangul_Syllable_Gyek.
+      (assert-equal (map 'list #'char-code (second (pathname-directory path)))
+		    expected)
+      (assert-equal (map 'list #'char-code (pathname-name path))
+		    expected)
+      (assert-equal (map 'list #'char-code (pathname-type path))
+		    expected))
+    #-darwin
+    (let ((expected (list (char-code #\Hangul_Syllable_Gyek))))
+      ;; For other OSes, just assume that the pathname is unchanged.
+      (assert-equal (map 'list #'char-code (second (pathname-directory path)))
+		    expected)
+      (assert-equal (map 'list #'char-code (pathname-name path))
+		    expected)
+      (assert-equal (map 'list #'char-code (pathname-type path))
+		    expected))))
+
+(define-test issue.158.dir
+    (:tag :issues)
+  (flet ((get-file ()
+	   ;; This assumes that there is only one file in resources/darwin
+	   (let ((files (directory (merge-pathnames "resources/darwin/*.txt" *test-path*))))
+	     (assert-equal (length files) 1)
+	     (first files))))
+    (let ((f (get-file))
+	  (expected-name "안녕하십니까"))
+      #+darwin
+      (assert-equal (pathname-name f)
+		    (unicode::decompose-hangul expected-name))
+      #-darwin
+      (assert-equal (pathname-name f) expected-name))))
+    
+
+
 (define-test issue.166
     (:tag :issues)
   ;; While this tests for the correct return value, the problem was
@@ -896,4 +944,3 @@
     (assert-true (typep idf-max-expo 'kernel:double-float-int-exponent))
     (assert-true (typep (1- idf-max-expo) 'kernel:double-float-int-exponent))
     (assert-false (typep (1+ idf-max-expo) 'kernel:double-float-int-exponent))))
-    


=====================================
tests/resources/darwin/안녕하십니까.txt
=====================================
@@ -0,0 +1,3 @@
+The file name of this file is "안녕하십니까.txt" ("Hello" in Korean.)
+
+



View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/compare/531ea53c4501269b59aa81e3bdc70778fd0325ac...a06dad9ba90b1ea416f9bd93cd41029df642e6f2

-- 
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/compare/531ea53c4501269b59aa81e3bdc70778fd0325ac...a06dad9ba90b1ea416f9bd93cd41029df642e6f2
You're receiving this email because of your account on gitlab.common-lisp.net.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://mailman.common-lisp.net/pipermail/cmucl-cvs/attachments/20230325/8d31a98c/attachment-0001.html>


More information about the cmucl-cvs mailing list