[cl-ppcre-devel] problem (bug?) with do-register-groups

tichy ssbm2 at o2.pl
Tue Dec 6 10:56:53 UTC 2005


#|

Hi.

I think there is a bug in cl-ppcre (do-register-groups). I wrote
two almost identical versions of function
'dtd-attributes-to-sexp'.  One with 'do-register-groups' hangs,
another which uses 'scan' works.


example:

(dtd-attributes-to-sexp
  "attr-foo (foo | bar) \"foo\" attr-bar CDATA #FIXED 'zonk' attr-baz NMTOKEN #REQUIRED")

result without 'read-dtd-parens':

(("attr-foo"    "(foo | bar)"  "foo")
  ("attr-bar"    :CDATA         (:FIXED "zonk"))
  ("attr-baz"    :NMTOKEN       :REQUIRED))


result with 'read-dtd-parens':

(("attr-foo" (:ALTERNATION 1 "foo" "bar") "foo")
  ("attr-bar" :CDATA                       (:FIXED "zonk"))
  ("attr-baz" :NMTOKEN                     :REQUIRED))

|#

(use-package :cl-ppcre)

(defparameter *dtd-attribute-name-regex*
   "([a-zA-z0-9._\\-:]+)")

(defparameter *dtd-attribute-type-regex*
   "(?:(\\(.*?\\))|(CDATA)|(ID)|(IDREF)|(IDREFS)|(NMTOKEN)|(NMTOKENS)|(ENTITY)|(ENTITIES)|(NOTATION)|(xml:\\w+))")

(defparameter *dtd-attribute-default-value-regex*
   "(?:('[^<>\"']*')|(\"[^<>\"]*\")|(#REQUIRED)|(#IMPLIED)|(?:#FIXED\\s+(?:('[^<>\"']*')|(\"[^<>\"]*\"))))")

;;; ---------- buggy 'dtd-attributes-to-sexp' ----------

;; Version with 'do-register-groups' macro. It works only without call
;; to 'read-dtd-parens' (which 'read-dtd...' uses 'cl-ppcre:scan').
;; 'read-dtd-parens' itself seems to be ok.
;; I think 'do-register-groups' has a bug.
;;
(defun dtd-attributes-to-sexp (target &aux result)
   (do-register-groups
    (a b1 b2 b3 b4 b5 b6 b7 b8 b9 b10 b11 c1 c2 c3 c4 c5.1 c5.2)
    ((format nil "\\s*~A\\s+~A\\s+~A\\s*" ; no load-time-value beause it's an example.
             *dtd-attribute-name-regex*
             *dtd-attribute-type-regex*
             *dtd-attribute-default-value-regex*)
     target (nreverse result))
    (push (list a
                (cond (b1   (read-dtd-parens b1)) #| *** !!!!!! *** |#
                      (b2   :cdata)
                      (b3   :id)
                      (b4   :idref)
                      (b5   :idrefs)
                      (b6   :nmtoken)
                      (b7   :nmtokens)
                      (b8   :entity)
                      (b9   :entities)
                      (b10  :notation)
                      (b11  b11))
                (cond (c1   (subseq c1 1 (1- (length c1))))
                      (c2   (subseq c2 1 (1- (length c2))))
                      (c3   :required)
                      (c4   :implied)
                      (c5.1 (list :fixed (subseq c5.1 1 (1- (length c5.1)))))
                      (c5.2 (list :fixed (subseq c5.2 1 (1- (length c5.2)))))))
          result)))


;;; ---------- 'dtd-attributes-to-sexp' that works ----------

;; Version without 'do-register-groups', i used 'scan' instead of it.
;; It works as intended.
;;
(defun dtd-attributes-to-sexp (string &aux result (start 0))
   (loop
    (multiple-value-bind (m-start m-end r-start r-end)
        (scan (format nil "\\s*~A\\s+~A\\s+~A\\s*" ; no load-time-value beause it's an example.
                      *dtd-attribute-name-regex*
                      *dtd-attribute-type-regex*
                      *dtd-attribute-default-value-regex*)
              string
              :start start)
      (declare (ignore m-start))
      (push (list (subseq string (svref r-start 0) (svref r-end 0))
                  (case (position-if #'identity r-start :start 1)
                    (1   (read-dtd-parens (subseq string (svref r-start 1) (svref r-end 1))))
                    (2   :cdata)
                    (3   :id)
                    (4   :idref)
                    (5   :idrefs)
                    (6   :nmtoken)
                    (7   :nmtokens)
                    (8   :entity)
                    (9   :entities)
                    (10  :notation)
                    (11  (subseq string (svref r-start 11) (svref r-end 11))))
                  (case (position-if #'identity r-start :start 12)
                    (12  (subseq string (1+ (svref r-start 12)) (1- (svref r-end 12))))
                    (13  (subseq string (1+ (svref r-start 13)) (1- (svref r-end 13))))
                    (14  :required)
                    (15  :implied)
                    (16  (list :fixed (subseq string (1+ (svref r-start 16)) (1- (svref r-end 16)))))
                    (17  (list :fixed (subseq string (1+ (svref r-start 17)) (1- (svref r-end 17)))))))
            result)
      (when (eql m-end (length string))
        (return (nreverse result)))
      (setq start m-end))))


;; This function is meant to be a pareser to fragments of dtd's
;; 'element' and "attribute' tags.
;; for example:
;;
;; (read-dtd-parens "(e | f)")
;;
;; ===> (:ALTERNATION 1 "e" "f")
;;
;; or
;;
;; (read-dtd-parens "((script|style|meta|link|object)*, ((title, (script|style|meta|link|object)*, (base, (script|style|meta|link|object)*)?) | (base, (script|style|meta|link|object)*, (title, (script|style|meta|link|object)*))))")
;;
;; ===> big sexp :)
;;
(defun read-dtd-parens (string
                         &optional (start 0)
                         &aux match-begin match-end result reg type stuff (index (1+ start)))
   (loop (cond ((char= #\( (char string index))
                (multiple-value-setq (stuff index) (read-dtd-parens string index))
                (push stuff result))
               ((multiple-value-setq (match-begin match-end)
                  (scan (load-time-value (create-scanner "^\\w+") t) string :start index))
                (setq index match-end)
                (if (find (char string match-end) "+*?")
                    (progn
                      (push (list (intern (string (char string match-end)))
                                  (subseq string match-begin match-end))
                            result)
                      (incf index))
                  (push (subseq string match-begin match-end) result)))
               ((multiple-value-setq (match-begin match-end reg)
                  (scan (case type
                          (:alternation
                           (load-time-value (create-scanner "^\\s*\\|\\s*") t))
                          (:sequence
                           (load-time-value (create-scanner "^\\s*,\\s*") t))
                          ('nil
                           (load-time-value (create-scanner "^\\s*(?:(\\|)|(,))\\s*") t)))
                        string :start index))
                (unless type (setq type (if (svref reg 0) :alternation :sequence)))
                (setq index match-end))
               ((char= #\) (char string index))
                (setq result (nreverse result))
                (return (case (if (array-in-bounds-p string (1+ index))
                                  (char string (1+ index)))
                          (#\* (values (cons type (cons '* result)) (+ 2 index)))
                          (#\? (values (cons type (cons '? result)) (+ 2 index)))
                          (#\+ (values (cons type (cons '+ result)) (+ 2 index)))
                          (t   (values (cons type (cons '1 result)) (+ 1 index)))))))))

;;; ---------- END.



More information about the Cl-ppcre-devel mailing list