[Git][cmucl/cmucl][master] 2 commits: Address #158: Filename encoding for Darwin
Raymond Toy (@rtoy)
gitlab at common-lisp.net
Fri Mar 24 14:31:56 UTC 2023
Raymond Toy pushed to branch master at cmucl / cmucl
Commits:
5dc0d7bf by Raymond Toy at 2023-03-24T14:31:40+00:00
Address #158: Filename encoding for Darwin
- - - - -
3578e015 by Raymond Toy at 2023-03-24T14:31:42+00:00
Merge branch 'issue-158-darwin-pathnames-utf8' into 'master'
Address #158: Filename encoding for Darwin
Closes #166 and #159
See merge request cmucl/cmucl!113
- - - - -
7 changed files:
- src/code/pathname.lisp
- src/code/save.lisp
- src/code/string.lisp
- src/code/unicode.lisp
- src/i18n/locale/cmucl.pot
- tests/issues.lisp
- + tests/resources/darwin/안녕하십니까.txt
Changes:
=====================================
src/code/pathname.lisp
=====================================
@@ -252,6 +252,14 @@
;;; This constructor is used to make an instance of the correct type
;;; from parsed arguments.
+#+darwin
+(defvar *enable-darwin-path-normalization* nil
+ "When non-NIL, pathnames are on Darwin are normalized when created.
+ Otherwise, the pathnames are unchanged.
+
+ This must be NIL during bootstrapping because Unicode is not yet
+ available.")
+
(defun %make-pathname-object (host device directory name type version)
(if (typep host 'logical-host)
(flet ((upcasify (thing)
@@ -271,7 +279,30 @@
(upcasify name)
(upcasify type)
(upcasify version)))
- (%make-pathname host device directory name type version)))
+ #-darwin
+ (%make-pathname host device directory name type version)
+ #+darwin
+ (flet ((normalize-name (piece)
+ ;; Normalize Darwin pathnames by converting Hangul
+ ;; syllables to conjoining jamo, and converting the
+ ;; string to NFD form, but skipping over a range of
+ ;; characters.
+ (typecase piece
+ (string
+ (if *enable-darwin-path-normalization*
+ (decompose (unicode::decompose-hangul piece)
+ :compatibility nil
+ :darwinp t)
+ piece))
+ (t
+ ;; What should we do about lisp::pattern objects
+ ;; that occur in the name component?
+ piece))))
+ (%make-pathname host device
+ (mapcar #'normalize-name directory)
+ (normalize-name name)
+ (normalize-name type)
+ version))))
;;; *LOGICAL-HOSTS* --internal.
;;;
=====================================
src/code/save.lisp
=====================================
@@ -202,7 +202,7 @@
(site-init "library:site-init")
(print-herald t)
(process-command-line t)
- #+:executable
+ #+:executable
(executable nil)
(batch-mode nil)
(quiet nil))
=====================================
src/code/string.lisp
=====================================
@@ -1097,7 +1097,10 @@
#+unicode
(progn
-(defun decompose (string &optional (compatibility t))
+(defun decompose (string &key (compatibility t) (start 0) end darwinp)
+ "Convert STRING to NFD (or NFKD). If :darwinp is non-NIL, then
+ characters in the ranges U2000-U2FFF, UF900-UFA6A, and U2F800-U2FA1D
+ are not decomposed, as specified for Darwin pathnames."
(declare (type string string))
(let ((result (make-string (cond ((< (length string) 40)
(* 5 (length string)))
@@ -1113,8 +1116,13 @@
(declare (type kernel:index i))
(multiple-value-bind (code wide) (codepoint string i)
(when wide (incf i))
- (let ((decomp (unicode-decomp code compatibility)))
- (if decomp (rec decomp 0 (length decomp)) (out code))))))
+ (if (and darwinp
+ (or (<= #x2000 code #x2fff)
+ (<= #xf900 code #xfa6a)
+ (<= #x2f800 code #x2fa1d)))
+ (out code)
+ (let ((decomp (unicode-decomp code compatibility)))
+ (if decomp (rec decomp 0 (length decomp)) (out code)))))))
(out (code)
(multiple-value-bind (hi lo) (surrogates code)
(outch hi)
@@ -1151,7 +1159,7 @@
(schar result (1+ last)))))
(decf last (if wide2 2 1)))
(t (return))))))))
- (with-string string
+ (with-one-string string start end offset-var
(rec string start end))
(shrink-vector result fillptr))))
@@ -1251,12 +1259,12 @@
(defun string-to-nfd (string)
_N"Convert String to Unicode Normalization Form D (NFD) using the
canonical decomposition. The NFD string is returned"
- (decompose string nil))
+ (decompose string :compatibility nil))
(defun string-to-nfkd (string)
_N"Convert String to Unicode Normalization Form KD (NFKD) uisng the
compatible decomposition form. The NFKD string is returned."
- (decompose string t))
+ (decompose string :compatibility t))
(defun string-to-nfc (string)
_N"Convert String to Unicode Normalization Form C (NFC). If the
=====================================
src/code/unicode.lisp
=====================================
@@ -517,3 +517,55 @@
(if (eq casing :simple)
(cl:string-capitalize string :start start :end end)
(string-capitalize-full string :start start :end end :casing casing))))
+
+
+(defun decompose-hangul-syllable (cp stream)
+ "Decompose the Hangul syllable codepoint CP to an equivalent sequence
+ of conjoining jamo and print the decomposed result to the stream
+ STREAM."
+ (let* ((s-base #xac00)
+ (l-base #x1100)
+ (v-base #x1161)
+ (t-base #x11a7)
+ (v-count 21)
+ (t-count 28)
+ (n-count (* v-count t-count)))
+ ;; Step 1: Compute index of the syllable S
+ (let ((s-index (- cp s-base)))
+ ;; Step 2: If s is in the range 0 <= s <= s-count, the compute
+ ;; the components.
+ (let ((l (+ l-base (truncate s-index n-count)))
+ (v (+ v-base (truncate (mod s-index n-count) t-count)))
+ (tt (+ t-base (mod s-index t-count))))
+ ;; Step 3: If tt = t-base, then there is no trailing character
+ ;; so replace s by the sequence <l,v>. Otherwise there is a
+ ;; trailing character, so replace s by the sequence <l,v,tt>.
+ (princ (code-char l) stream)
+ (princ (code-char v) stream)
+ (unless (= tt t-base)
+ (princ (code-char tt) stream)))))
+ (values))
+
+(defun is-hangul-syllable (codepoint)
+ "Test if CODEPOINT is a Hangul syllable"
+ (let* ((s-base #xac00)
+ (l-count 19)
+ (v-count 21)
+ (t-count 28)
+ (n-count (* v-count t-count))
+ (number-of-syllables (* l-count n-count)))
+ (<= 0 (- codepoint s-base) number-of-syllables)))
+
+(defun decompose-hangul (string)
+ "Decompose any Hangul syllables in STRING to an equivalent sequence of
+ conjoining jamo characters."
+ (with-output-to-string (s)
+ (loop for cp being the codepoints of string
+ do
+ (if (is-hangul-syllable cp)
+ (decompose-hangul-syllable cp s)
+ (multiple-value-bind (high low)
+ (surrogates cp)
+ (princ high s)
+ (when low
+ (princ low s)))))))
=====================================
src/i18n/locale/cmucl.pot
=====================================
@@ -4012,6 +4012,14 @@ msgid ""
" string is returned."
msgstr ""
+#: src/code/string.lisp
+msgid ""
+"Convert String to NFD (or NFKD). If :darwinp is non-NIL, then\n"
+" characters in the ranges U2000-U2FFF, UF900-UFA6A, and\n"
+" U2F800-U2FA1D are not decomposed, as specified for Darwin\n"
+" pathnames."
+msgstr ""
+
#: src/code/string.lisp
msgid ""
"Convert a sequence of codepoints to a string. Codepoints outside\n"
@@ -15271,6 +15279,23 @@ msgid ""
" delimited by non-case-modifiable chars. "
msgstr ""
+#: src/code/unicode.lisp
+msgid ""
+"Decompose the Hangul syllable codepoint CP to an equivalent sequence\n"
+" of conjoining jamo and print the decomposed result to the stream\n"
+" STREAM."
+msgstr ""
+
+#: src/code/unicode.lisp
+msgid "Test if CODEPOINT is a Hangul syllable"
+msgstr ""
+
+#: src/code/unicode.lisp
+msgid ""
+"Decompose any Hangul syllables in STRING to an equivalent sequence of\n"
+" conjoining jamo characters."
+msgstr ""
+
#: src/compiler/macros.lisp
msgid ""
"Policy Node Condition*\n"
=====================================
tests/issues.lisp
=====================================
@@ -832,6 +832,54 @@
+(define-test issue.158
+ (:tag :issues)
+ (let* ((name (string #\Hangul_Syllable_Gyek))
+ (path (make-pathname :directory (list :relative name)
+ :name name
+ :type name)))
+ ;; Enable this when we implement normalization for Darwin
+ #+(and nil darwin)
+ (let ((expected '(4352 4456 4543)))
+ ;; Tests that on Darwin the Hangul pathname has been normalized
+ ;; correctly. We fill in the directory, name, and type components
+ ;; with the same thing since it shouldn't really matter.
+ ;;
+ ;; The expected value is the conjoining jamo for the character
+ ;; #\Hangul_Syllable_Gyek.
+ (assert-equal (map 'list #'char-code (second (pathname-directory path)))
+ expected)
+ (assert-equal (map 'list #'char-code (pathname-name path))
+ expected)
+ (assert-equal (map 'list #'char-code (pathname-type path))
+ expected))
+ #-darwin
+ (let ((expected (list (char-code #\Hangul_Syllable_Gyek))))
+ ;; For other OSes, just assume that the pathname is unchanged.
+ (assert-equal (map 'list #'char-code (second (pathname-directory path)))
+ expected)
+ (assert-equal (map 'list #'char-code (pathname-name path))
+ expected)
+ (assert-equal (map 'list #'char-code (pathname-type path))
+ expected))))
+
+(define-test issue.158.dir
+ (:tag :issues)
+ (flet ((get-file ()
+ ;; This assumes that there is only one file in resources/darwin
+ (let ((files (directory (merge-pathnames "resources/darwin/*.txt" *test-path*))))
+ (assert-equal (length files) 1)
+ (first files))))
+ (let ((f (get-file))
+ (expected-name "안녕하십니까"))
+ #+darwin
+ (assert-equal (pathname-name f)
+ (unicode::decompose-hangul expected-name))
+ #-darwin
+ (assert-equal (pathname-name f) expected-name))))
+
+
+
(define-test issue.166
(:tag :issues)
;; While this tests for the correct return value, the problem was
@@ -896,4 +944,3 @@
(assert-true (typep idf-max-expo 'kernel:double-float-int-exponent))
(assert-true (typep (1- idf-max-expo) 'kernel:double-float-int-exponent))
(assert-false (typep (1+ idf-max-expo) 'kernel:double-float-int-exponent))))
-
=====================================
tests/resources/darwin/안녕하십니까.txt
=====================================
@@ -0,0 +1,3 @@
+The file name of this file is "안녕하십니까.txt" ("Hello" in Korean.)
+
+
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/compare/c26f8ede5b0828a2db337fc372ab09d7b49fd5e1...3578e01505f9afc9272d6575c864ef632c57e6b7
--
View it on GitLab: https://gitlab.common-lisp.net/cmucl/cmucl/-/compare/c26f8ede5b0828a2db337fc372ab09d7b49fd5e1...3578e01505f9afc9272d6575c864ef632c57e6b7
You're receiving this email because of your account on gitlab.common-lisp.net.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://mailman.common-lisp.net/pipermail/cmucl-cvs/attachments/20230324/e02744c2/attachment-0001.html>
More information about the cmucl-cvs
mailing list