[s-xml-devel] HTML entities

Gustavo Milaré gugamilare at gmail.com
Sun Feb 3 03:58:28 UTC 2013


Hello,

I am using S-XML to parse some HTML code and stumbled upon the problem 
that S-XML doesn't support HTML entities (like &copy; and &ecirc;). 
I've created a patch that solves that problem, if anyone is interested.

Regards,
Gustavo
-------------- next part --------------
Index: src/package.lisp
===================================================================
RCS file: /project/s-xml/cvsroot/s-xml/src/package.lisp,v
retrieving revision 1.8
diff -u -r1.8 package.lisp
--- src/package.lisp	31 Jan 2006 11:44:15 -0000	1.8
+++ src/package.lisp	3 Feb 2013 03:49:55 -0000
@@ -23,6 +23,7 @@
    #:xml-parser-error #:xml-parser-error-message #:xml-parser-error-args #:xml-parser-error-stream
    #:xml-parser-state #:get-entities #:get-seed
    #:get-new-element-hook #:get-finish-element-hook #:get-text-hook
+   #:make-standard-entities #:make-html-entities
    ;; callbacks
    #:*attribute-name-parser*
    #:*attribute-value-parser*
@@ -39,7 +40,9 @@
    #:*ignore-namespaces* #:*local-namespace* #:*namespaces*
    #:*require-existing-symbols* #:*auto-export-symbols* #:*auto-create-namespace-packages*
    #:find-namespace #:register-namespace #:get-prefix #:get-uri #:get-package
-   #:resolve-identifier #:extend-namespaces #:print-identifier #:split-identifier)
+   #:resolve-identifier #:extend-namespaces #:print-identifier #:split-identifier
+   ;; options
+   #:*html-compatibility-mode*)
   (:documentation 
    "A simple XML parser with an efficient, purely functional, event-based interface as well as a DOM interface"))
 
Index: src/xml.lisp
===================================================================
RCS file: /project/s-xml/cvsroot/s-xml/src/xml.lisp,v
retrieving revision 1.16
diff -u -r1.16 xml.lisp
--- src/xml.lisp	31 Jan 2006 11:44:15 -0000	1.16
+++ src/xml.lisp	3 Feb 2013 03:49:56 -0000
@@ -124,6 +124,9 @@
 		    (write (char-code char) :stream stream :base 16)
 		    (write-char #\; stream)))))))
 
+(defvar *html-compatibility-mode* nil
+  "If non-nil, S-XML will be able to parse HTML entities")
+
 (defun make-standard-entities ()
   "A hashtable mapping XML entity names to their replacement strings,
   filled with the standard set"
@@ -136,6 +139,265 @@
 	  (gethash "nbsp" entities) (string #\space))
     entities))
 
+(defun make-html-entities ()
+  "A hashtable mapping HTML entity names to their replacement strings,
+  filled with the standard set"
+  (let ((entities (make-hash-table :test #'equal)))
+    (setf (gethash "amp" entities) (string #\&)
+          (gethash "quot" entities) (string #\")
+          (gethash "apos" entities) (string #\')
+          (gethash "lt" entities) (string #\<)
+          (gethash "gt" entities) (string #\>)
+          (gethash "nbsp" entities) (string #\space)
+          (gethash "iexcl" entities) (string (code-char 161))
+          (gethash "cent" entities) (string (code-char 162))
+          (gethash "pound" entities) (string (code-char 163))
+          (gethash "curren" entities) (string (code-char 164))
+          (gethash "yen" entities) (string (code-char 165))
+          (gethash "brvbar" entities) (string (code-char 166))
+          (gethash "sect" entities) (string (code-char 167))
+          (gethash "uml" entities) (string (code-char 168))
+          (gethash "copy" entities) (string (code-char 169))
+          (gethash "ordf" entities) (string (code-char 170))
+          (gethash "laquo" entities) (string (code-char 171))
+          (gethash "not" entities) (string (code-char 172))
+          (gethash "shy" entities) (string (code-char 173))
+          (gethash "reg" entities) (string (code-char 174))
+          (gethash "macr" entities) (string (code-char 175))
+          (gethash "deg" entities) (string (code-char 176))
+          (gethash "plusmn" entities) (string (code-char 177))
+          (gethash "sup2" entities) (string (code-char 178))
+          (gethash "sup3" entities) (string (code-char 179))
+          (gethash "acute" entities) (string (code-char 180))
+          (gethash "micro" entities) (string (code-char 181))
+          (gethash "para" entities) (string (code-char 182))
+          (gethash "middot" entities) (string (code-char 183))
+          (gethash "cedil" entities) (string (code-char 184))
+          (gethash "sup1" entities) (string (code-char 185))
+          (gethash "ordm" entities) (string (code-char 186))
+          (gethash "raquo" entities) (string (code-char 187))
+          (gethash "frac14" entities) (string (code-char 188))
+          (gethash "frac12" entities) (string (code-char 189))
+          (gethash "frac34" entities) (string (code-char 190))
+          (gethash "iquest" entities) (string (code-char 191))
+          (gethash "Agrave" entities) (string (code-char 192))
+          (gethash "Aacute" entities) (string (code-char 193))
+          (gethash "Acirc" entities) (string (code-char 194))
+          (gethash "Atilde" entities) (string (code-char 195))
+          (gethash "Auml" entities) (string (code-char 196))
+          (gethash "Aring" entities) (string (code-char 197))
+          (gethash "AElig" entities) (string (code-char 198))
+          (gethash "Ccedil" entities) (string (code-char 199))
+          (gethash "Egrave" entities) (string (code-char 200))
+          (gethash "Eacute" entities) (string (code-char 201))
+          (gethash "Ecirc" entities) (string (code-char 202))
+          (gethash "Euml" entities) (string (code-char 203))
+          (gethash "Igrave" entities) (string (code-char 204))
+          (gethash "Iacute" entities) (string (code-char 205))
+          (gethash "Icirc" entities) (string (code-char 206))
+          (gethash "Iuml" entities) (string (code-char 207))
+          (gethash "ETH" entities) (string (code-char 208))
+          (gethash "Ntilde" entities) (string (code-char 209))
+          (gethash "Ograve" entities) (string (code-char 210))
+          (gethash "Oacute" entities) (string (code-char 211))
+          (gethash "Ocirc" entities) (string (code-char 212))
+          (gethash "Otilde" entities) (string (code-char 213))
+          (gethash "Ouml" entities) (string (code-char 214))
+          (gethash "times" entities) (string (code-char 215))
+          (gethash "Oslash" entities) (string (code-char 216))
+          (gethash "Ugrave" entities) (string (code-char 217))
+          (gethash "Uacute" entities) (string (code-char 218))
+          (gethash "Ucirc" entities) (string (code-char 219))
+          (gethash "Uuml" entities) (string (code-char 220))
+          (gethash "Yacute" entities) (string (code-char 221))
+          (gethash "THORN" entities) (string (code-char 222))
+          (gethash "szlig" entities) (string (code-char 223))
+          (gethash "agrave" entities) (string (code-char 224))
+          (gethash "aacute" entities) (string (code-char 225))
+          (gethash "acirc" entities) (string (code-char 226))
+          (gethash "atilde" entities) (string (code-char 227))
+          (gethash "auml" entities) (string (code-char 228))
+          (gethash "aring" entities) (string (code-char 229))
+          (gethash "aelig" entities) (string (code-char 230))
+          (gethash "ccedil" entities) (string (code-char 231))
+          (gethash "egrave" entities) (string (code-char 232))
+          (gethash "eacute" entities) (string (code-char 233))
+          (gethash "ecirc" entities) (string (code-char 234))
+          (gethash "euml" entities) (string (code-char 235))
+          (gethash "igrave" entities) (string (code-char 236))
+          (gethash "iacute" entities) (string (code-char 237))
+          (gethash "icirc" entities) (string (code-char 238))
+          (gethash "iuml" entities) (string (code-char 239))
+          (gethash "eth" entities) (string (code-char 240))
+          (gethash "ntilde" entities) (string (code-char 241))
+          (gethash "ograve" entities) (string (code-char 242))
+          (gethash "oacute" entities) (string (code-char 243))
+          (gethash "ocirc" entities) (string (code-char 244))
+          (gethash "otilde" entities) (string (code-char 245))
+          (gethash "ouml" entities) (string (code-char 246))
+          (gethash "divide" entities) (string (code-char 247))
+          (gethash "oslash" entities) (string (code-char 248))
+          (gethash "ugrave" entities) (string (code-char 249))
+          (gethash "uacute" entities) (string (code-char 250))
+          (gethash "ucirc" entities) (string (code-char 251))
+          (gethash "uuml" entities) (string (code-char 252))
+          (gethash "yacute" entities) (string (code-char 253))
+          (gethash "thorn" entities) (string (code-char 254))
+          (gethash "yuml" entities) (string (code-char 255))
+          (gethash "OElig" entities) (string (code-char 338))
+          (gethash "oelig" entities) (string (code-char 339))
+          (gethash "Scaron" entities) (string (code-char 352))
+          (gethash "scaron" entities) (string (code-char 353))
+          (gethash "Yuml" entities) (string (code-char 376))
+          (gethash "fnof" entities) (string (code-char 402))
+          (gethash "circ" entities) (string (code-char 710))
+          (gethash "tilde" entities) (string (code-char 732))
+          (gethash "Alpha" entities) (string (code-char 913))
+          (gethash "Beta" entities) (string (code-char 914))
+          (gethash "Gamma" entities) (string (code-char 915))
+          (gethash "Delta" entities) (string (code-char 916))
+          (gethash "Epsilon" entities) (string (code-char 917))
+          (gethash "Zeta" entities) (string (code-char 918))
+          (gethash "Eta" entities) (string (code-char 919))
+          (gethash "Theta" entities) (string (code-char 920))
+          (gethash "Iota" entities) (string (code-char 921))
+          (gethash "Kappa" entities) (string (code-char 922))
+          (gethash "Lambda" entities) (string (code-char 923))
+          (gethash "Mu" entities) (string (code-char 924))
+          (gethash "Nu" entities) (string (code-char 925))
+          (gethash "Xi" entities) (string (code-char 926))
+          (gethash "Omicron" entities) (string (code-char 927))
+          (gethash "Pi" entities) (string (code-char 928))
+          (gethash "Rho" entities) (string (code-char 929))
+          (gethash "Sigma" entities) (string (code-char 931))
+          (gethash "Tau" entities) (string (code-char 932))
+          (gethash "Upsilon" entities) (string (code-char 933))
+          (gethash "Phi" entities) (string (code-char 934))
+          (gethash "Chi" entities) (string (code-char 935))
+          (gethash "Psi" entities) (string (code-char 936))
+          (gethash "Omega" entities) (string (code-char 937))
+          (gethash "alpha" entities) (string (code-char 945))
+          (gethash "beta" entities) (string (code-char 946))
+          (gethash "gamma" entities) (string (code-char 947))
+          (gethash "delta" entities) (string (code-char 948))
+          (gethash "epsilon" entities) (string (code-char 949))
+          (gethash "zeta" entities) (string (code-char 950))
+          (gethash "eta" entities) (string (code-char 951))
+          (gethash "theta" entities) (string (code-char 952))
+          (gethash "iota" entities) (string (code-char 953))
+          (gethash "kappa" entities) (string (code-char 954))
+          (gethash "lambda" entities) (string (code-char 955))
+          (gethash "mu" entities) (string (code-char 956))
+          (gethash "nu" entities) (string (code-char 957))
+          (gethash "xi" entities) (string (code-char 958))
+          (gethash "omicron" entities) (string (code-char 959))
+          (gethash "pi" entities) (string (code-char 960))
+          (gethash "rho" entities) (string (code-char 961))
+          (gethash "sigmaf" entities) (string (code-char 962))
+          (gethash "sigma" entities) (string (code-char 963))
+          (gethash "tau" entities) (string (code-char 964))
+          (gethash "upsilon" entities) (string (code-char 965))
+          (gethash "phi" entities) (string (code-char 966))
+          (gethash "chi" entities) (string (code-char 967))
+          (gethash "psi" entities) (string (code-char 968))
+          (gethash "omega" entities) (string (code-char 969))
+          (gethash "thetasym" entities) (string (code-char 977))
+          (gethash "upsih" entities) (string (code-char 978))
+          (gethash "piv" entities) (string (code-char 982))
+          (gethash "ensp" entities) (string (code-char 8194))
+          (gethash "emsp" entities) (string (code-char 8195))
+          (gethash "thinsp" entities) (string (code-char 8201))
+          (gethash "zwnj" entities) (string (code-char 8204))
+          (gethash "zwj" entities) (string (code-char 8205))
+          (gethash "lrm" entities) (string (code-char 8206))
+          (gethash "rlm" entities) (string (code-char 8207))
+          (gethash "ndash" entities) (string (code-char 8211))
+          (gethash "mdash" entities) (string (code-char 8212))
+          (gethash "lsquo" entities) (string (code-char 8216))
+          (gethash "rsquo" entities) (string (code-char 8217))
+          (gethash "sbquo" entities) (string (code-char 8218))
+          (gethash "ldquo" entities) (string (code-char 8220))
+          (gethash "rdquo" entities) (string (code-char 8221))
+          (gethash "bdquo" entities) (string (code-char 8222))
+          (gethash "dagger" entities) (string (code-char 8224))
+          (gethash "Dagger" entities) (string (code-char 8225))
+          (gethash "bull" entities) (string (code-char 8226))
+          (gethash "hellip" entities) (string (code-char 8230))
+          (gethash "permil" entities) (string (code-char 8240))
+          (gethash "prime" entities) (string (code-char 8242))
+          (gethash "Prime" entities) (string (code-char 8243))
+          (gethash "lsaquo" entities) (string (code-char 8249))
+          (gethash "rsaquo" entities) (string (code-char 8250))
+          (gethash "oline" entities) (string (code-char 8254))
+          (gethash "frasl" entities) (string (code-char 8260))
+          (gethash "euro" entities) (string (code-char 8364))
+          (gethash "image" entities) (string (code-char 8465))
+          (gethash "weierp" entities) (string (code-char 8472))
+          (gethash "real" entities) (string (code-char 8476))
+          (gethash "trade" entities) (string (code-char 8482))
+          (gethash "alefsym" entities) (string (code-char 8501))
+          (gethash "larr" entities) (string (code-char 8592))
+          (gethash "uarr" entities) (string (code-char 8593))
+          (gethash "rarr" entities) (string (code-char 8594))
+          (gethash "darr" entities) (string (code-char 8595))
+          (gethash "harr" entities) (string (code-char 8596))
+          (gethash "crarr" entities) (string (code-char 8629))
+          (gethash "lArr" entities) (string (code-char 8656))
+          (gethash "uArr" entities) (string (code-char 8657))
+          (gethash "rArr" entities) (string (code-char 8658))
+          (gethash "dArr" entities) (string (code-char 8659))
+          (gethash "hArr" entities) (string (code-char 8660))
+          (gethash "forall" entities) (string (code-char 8704))
+          (gethash "part" entities) (string (code-char 8706))
+          (gethash "exist" entities) (string (code-char 8707))
+          (gethash "empty" entities) (string (code-char 8709))
+          (gethash "nabla" entities) (string (code-char 8711))
+          (gethash "isin" entities) (string (code-char 8712))
+          (gethash "notin" entities) (string (code-char 8713))
+          (gethash "ni" entities) (string (code-char 8715))
+          (gethash "prod" entities) (string (code-char 8719))
+          (gethash "sum" entities) (string (code-char 8721))
+          (gethash "minus" entities) (string (code-char 8722))
+          (gethash "lowast" entities) (string (code-char 8727))
+          (gethash "radic" entities) (string (code-char 8730))
+          (gethash "prop" entities) (string (code-char 8733))
+          (gethash "infin" entities) (string (code-char 8734))
+          (gethash "ang" entities) (string (code-char 8736))
+          (gethash "and" entities) (string (code-char 8743))
+          (gethash "or" entities) (string (code-char 8744))
+          (gethash "cap" entities) (string (code-char 8745))
+          (gethash "cup" entities) (string (code-char 8746))
+          (gethash "int" entities) (string (code-char 8747))
+          (gethash "there4" entities) (string (code-char 8756))
+          (gethash "sim" entities) (string (code-char 8764))
+          (gethash "cong" entities) (string (code-char 8773))
+          (gethash "asymp" entities) (string (code-char 8776))
+          (gethash "ne" entities) (string (code-char 8800))
+          (gethash "equiv" entities) (string (code-char 8801))
+          (gethash "le" entities) (string (code-char 8804))
+          (gethash "ge" entities) (string (code-char 8805))
+          (gethash "sub" entities) (string (code-char 8834))
+          (gethash "sup" entities) (string (code-char 8835))
+          (gethash "nsub" entities) (string (code-char 8836))
+          (gethash "sube" entities) (string (code-char 8838))
+          (gethash "supe" entities) (string (code-char 8839))
+          (gethash "oplus" entities) (string (code-char 8853))
+          (gethash "otimes" entities) (string (code-char 8855))
+          (gethash "perp" entities) (string (code-char 8869))
+          (gethash "sdot" entities) (string (code-char 8901))
+          (gethash "lceil" entities) (string (code-char 8968))
+          (gethash "rceil" entities) (string (code-char 8969))
+          (gethash "lfloor" entities) (string (code-char 8970))
+          (gethash "rfloor" entities) (string (code-char 8971))
+          (gethash "lang" entities) (string (code-char 9001))
+          (gethash "rang" entities) (string (code-char 9002))
+          (gethash "loz" entities) (string (code-char 9674))
+          (gethash "spades" entities) (string (code-char 9824))
+          (gethash "clubs" entities) (string (code-char 9827))
+          (gethash "hearts" entities) (string (code-char 9829))
+          (gethash "diams" entities) (string (code-char 9830)))
+    entities))
+
 (defun resolve-entity (stream extendable-string entities entity)
   "Read and resolve an XML entity from stream, positioned after the '&' entity marker, 
   accepting &name; &#DEC; and &#xHEX; formats,
@@ -341,7 +603,9 @@
   ((entities :documentation "A hashtable mapping XML entity names to their replacement stings"
 	     :accessor get-entities
 	     :initarg :entities
-	     :initform (make-standard-entities))
+	     :initform (if *html-compatibility-mode*
+                           (make-html-entities)
+                           (make-standard-entities)))
    (seed :documentation "The user seed object"
 	 :accessor get-seed
 	 :initarg :seed


More information about the s-xml-devel mailing list