[s-xml-devel] HTML entities
Gustavo Milaré
gugamilare at gmail.com
Sun Feb 3 03:58:28 UTC 2013
Hello,
I am using S-XML to parse some HTML code and stumbled upon the problem
that S-XML doesn't support HTML entities (like &copy; and &ecirc;).
I've created a patch that solves that problem, if anyone is interested.
Regards,
Gustavo
-------------- next part --------------
Index: src/package.lisp
===================================================================
RCS file: /project/s-xml/cvsroot/s-xml/src/package.lisp,v
retrieving revision 1.8
diff -u -r1.8 package.lisp
--- src/package.lisp 31 Jan 2006 11:44:15 -0000 1.8
+++ src/package.lisp 3 Feb 2013 03:49:55 -0000
@@ -23,6 +23,7 @@
#:xml-parser-error #:xml-parser-error-message #:xml-parser-error-args #:xml-parser-error-stream
#:xml-parser-state #:get-entities #:get-seed
#:get-new-element-hook #:get-finish-element-hook #:get-text-hook
+ #:make-standard-entities #:make-html-entities
;; callbacks
#:*attribute-name-parser*
#:*attribute-value-parser*
@@ -39,7 +40,9 @@
#:*ignore-namespaces* #:*local-namespace* #:*namespaces*
#:*require-existing-symbols* #:*auto-export-symbols* #:*auto-create-namespace-packages*
#:find-namespace #:register-namespace #:get-prefix #:get-uri #:get-package
- #:resolve-identifier #:extend-namespaces #:print-identifier #:split-identifier)
+ #:resolve-identifier #:extend-namespaces #:print-identifier #:split-identifier
+ ;; options
+ #:*html-compatibility-mode*)
(:documentation
"A simple XML parser with an efficient, purely functional, event-based interface as well as a DOM interface"))
Index: src/xml.lisp
===================================================================
RCS file: /project/s-xml/cvsroot/s-xml/src/xml.lisp,v
retrieving revision 1.16
diff -u -r1.16 xml.lisp
--- src/xml.lisp 31 Jan 2006 11:44:15 -0000 1.16
+++ src/xml.lisp 3 Feb 2013 03:49:56 -0000
@@ -124,6 +124,9 @@
(write (char-code char) :stream stream :base 16)
(write-char #\; stream)))))))
+(defvar *html-compatibility-mode* nil
+ "If non-nil, the default ENTITIES table of a new parser state comes from MAKE-HTML-ENTITIES instead of MAKE-STANDARD-ENTITIES")
+
(defun make-standard-entities ()
"A hashtable mapping XML entity names to their replacement strings,
filled with the standard set"
@@ -136,6 +139,265 @@
(gethash "nbsp" entities) (string #\space))
entities))
+(defun make-html-entities ()
+ "Return a fresh EQUAL hashtable mapping HTML entity names (case-sensitive strings) to one-character
+ replacement strings: the basic XML names plus Latin-1, Greek and symbol names; nbsp maps to a plain space"
+ (let ((entities (make-hash-table :test #'equal)))
+ (setf (gethash "amp" entities) (string #\&)
+ (gethash "quot" entities) (string #\")
+ (gethash "apos" entities) (string #\')
+ (gethash "lt" entities) (string #\<)
+ (gethash "gt" entities) (string #\>)
+ (gethash "nbsp" entities) (string #\space)
+ (gethash "iexcl" entities) (string (code-char 161))
+ (gethash "cent" entities) (string (code-char 162))
+ (gethash "pound" entities) (string (code-char 163))
+ (gethash "curren" entities) (string (code-char 164))
+ (gethash "yen" entities) (string (code-char 165))
+ (gethash "brvbar" entities) (string (code-char 166))
+ (gethash "sect" entities) (string (code-char 167))
+ (gethash "uml" entities) (string (code-char 168))
+ (gethash "copy" entities) (string (code-char 169))
+ (gethash "ordf" entities) (string (code-char 170))
+ (gethash "laquo" entities) (string (code-char 171))
+ (gethash "not" entities) (string (code-char 172))
+ (gethash "shy" entities) (string (code-char 173))
+ (gethash "reg" entities) (string (code-char 174))
+ (gethash "macr" entities) (string (code-char 175))
+ (gethash "deg" entities) (string (code-char 176))
+ (gethash "plusmn" entities) (string (code-char 177))
+ (gethash "sup2" entities) (string (code-char 178))
+ (gethash "sup3" entities) (string (code-char 179))
+ (gethash "acute" entities) (string (code-char 180))
+ (gethash "micro" entities) (string (code-char 181))
+ (gethash "para" entities) (string (code-char 182))
+ (gethash "middot" entities) (string (code-char 183))
+ (gethash "cedil" entities) (string (code-char 184))
+ (gethash "sup1" entities) (string (code-char 185))
+ (gethash "ordm" entities) (string (code-char 186))
+ (gethash "raquo" entities) (string (code-char 187))
+ (gethash "frac14" entities) (string (code-char 188))
+ (gethash "frac12" entities) (string (code-char 189))
+ (gethash "frac34" entities) (string (code-char 190))
+ (gethash "iquest" entities) (string (code-char 191))
+ (gethash "Agrave" entities) (string (code-char 192))
+ (gethash "Aacute" entities) (string (code-char 193))
+ (gethash "Acirc" entities) (string (code-char 194))
+ (gethash "Atilde" entities) (string (code-char 195))
+ (gethash "Auml" entities) (string (code-char 196))
+ (gethash "Aring" entities) (string (code-char 197))
+ (gethash "AElig" entities) (string (code-char 198))
+ (gethash "Ccedil" entities) (string (code-char 199))
+ (gethash "Egrave" entities) (string (code-char 200))
+ (gethash "Eacute" entities) (string (code-char 201))
+ (gethash "Ecirc" entities) (string (code-char 202))
+ (gethash "Euml" entities) (string (code-char 203))
+ (gethash "Igrave" entities) (string (code-char 204))
+ (gethash "Iacute" entities) (string (code-char 205))
+ (gethash "Icirc" entities) (string (code-char 206))
+ (gethash "Iuml" entities) (string (code-char 207))
+ (gethash "ETH" entities) (string (code-char 208))
+ (gethash "Ntilde" entities) (string (code-char 209))
+ (gethash "Ograve" entities) (string (code-char 210))
+ (gethash "Oacute" entities) (string (code-char 211))
+ (gethash "Ocirc" entities) (string (code-char 212))
+ (gethash "Otilde" entities) (string (code-char 213))
+ (gethash "Ouml" entities) (string (code-char 214))
+ (gethash "times" entities) (string (code-char 215))
+ (gethash "Oslash" entities) (string (code-char 216))
+ (gethash "Ugrave" entities) (string (code-char 217))
+ (gethash "Uacute" entities) (string (code-char 218))
+ (gethash "Ucirc" entities) (string (code-char 219))
+ (gethash "Uuml" entities) (string (code-char 220))
+ (gethash "Yacute" entities) (string (code-char 221))
+ (gethash "THORN" entities) (string (code-char 222))
+ (gethash "szlig" entities) (string (code-char 223))
+ (gethash "agrave" entities) (string (code-char 224))
+ (gethash "aacute" entities) (string (code-char 225))
+ (gethash "acirc" entities) (string (code-char 226))
+ (gethash "atilde" entities) (string (code-char 227))
+ (gethash "auml" entities) (string (code-char 228))
+ (gethash "aring" entities) (string (code-char 229))
+ (gethash "aelig" entities) (string (code-char 230))
+ (gethash "ccedil" entities) (string (code-char 231))
+ (gethash "egrave" entities) (string (code-char 232))
+ (gethash "eacute" entities) (string (code-char 233))
+ (gethash "ecirc" entities) (string (code-char 234))
+ (gethash "euml" entities) (string (code-char 235))
+ (gethash "igrave" entities) (string (code-char 236))
+ (gethash "iacute" entities) (string (code-char 237))
+ (gethash "icirc" entities) (string (code-char 238))
+ (gethash "iuml" entities) (string (code-char 239))
+ (gethash "eth" entities) (string (code-char 240))
+ (gethash "ntilde" entities) (string (code-char 241))
+ (gethash "ograve" entities) (string (code-char 242))
+ (gethash "oacute" entities) (string (code-char 243))
+ (gethash "ocirc" entities) (string (code-char 244))
+ (gethash "otilde" entities) (string (code-char 245))
+ (gethash "ouml" entities) (string (code-char 246))
+ (gethash "divide" entities) (string (code-char 247))
+ (gethash "oslash" entities) (string (code-char 248))
+ (gethash "ugrave" entities) (string (code-char 249))
+ (gethash "uacute" entities) (string (code-char 250))
+ (gethash "ucirc" entities) (string (code-char 251))
+ (gethash "uuml" entities) (string (code-char 252))
+ (gethash "yacute" entities) (string (code-char 253))
+ (gethash "thorn" entities) (string (code-char 254))
+ (gethash "yuml" entities) (string (code-char 255))
+ (gethash "OElig" entities) (string (code-char 338))
+ (gethash "oelig" entities) (string (code-char 339))
+ (gethash "Scaron" entities) (string (code-char 352))
+ (gethash "scaron" entities) (string (code-char 353))
+ (gethash "Yuml" entities) (string (code-char 376))
+ (gethash "fnof" entities) (string (code-char 402))
+ (gethash "circ" entities) (string (code-char 710))
+ (gethash "tilde" entities) (string (code-char 732))
+ (gethash "Alpha" entities) (string (code-char 913))
+ (gethash "Beta" entities) (string (code-char 914))
+ (gethash "Gamma" entities) (string (code-char 915))
+ (gethash "Delta" entities) (string (code-char 916))
+ (gethash "Epsilon" entities) (string (code-char 917))
+ (gethash "Zeta" entities) (string (code-char 918))
+ (gethash "Eta" entities) (string (code-char 919))
+ (gethash "Theta" entities) (string (code-char 920))
+ (gethash "Iota" entities) (string (code-char 921))
+ (gethash "Kappa" entities) (string (code-char 922))
+ (gethash "Lambda" entities) (string (code-char 923))
+ (gethash "Mu" entities) (string (code-char 924))
+ (gethash "Nu" entities) (string (code-char 925))
+ (gethash "Xi" entities) (string (code-char 926))
+ (gethash "Omicron" entities) (string (code-char 927))
+ (gethash "Pi" entities) (string (code-char 928))
+ (gethash "Rho" entities) (string (code-char 929))
+ (gethash "Sigma" entities) (string (code-char 931))
+ (gethash "Tau" entities) (string (code-char 932))
+ (gethash "Upsilon" entities) (string (code-char 933))
+ (gethash "Phi" entities) (string (code-char 934))
+ (gethash "Chi" entities) (string (code-char 935))
+ (gethash "Psi" entities) (string (code-char 936))
+ (gethash "Omega" entities) (string (code-char 937))
+ (gethash "alpha" entities) (string (code-char 945))
+ (gethash "beta" entities) (string (code-char 946))
+ (gethash "gamma" entities) (string (code-char 947))
+ (gethash "delta" entities) (string (code-char 948))
+ (gethash "epsilon" entities) (string (code-char 949))
+ (gethash "zeta" entities) (string (code-char 950))
+ (gethash "eta" entities) (string (code-char 951))
+ (gethash "theta" entities) (string (code-char 952))
+ (gethash "iota" entities) (string (code-char 953))
+ (gethash "kappa" entities) (string (code-char 954))
+ (gethash "lambda" entities) (string (code-char 955))
+ (gethash "mu" entities) (string (code-char 956))
+ (gethash "nu" entities) (string (code-char 957))
+ (gethash "xi" entities) (string (code-char 958))
+ (gethash "omicron" entities) (string (code-char 959))
+ (gethash "pi" entities) (string (code-char 960))
+ (gethash "rho" entities) (string (code-char 961))
+ (gethash "sigmaf" entities) (string (code-char 962))
+ (gethash "sigma" entities) (string (code-char 963))
+ (gethash "tau" entities) (string (code-char 964))
+ (gethash "upsilon" entities) (string (code-char 965))
+ (gethash "phi" entities) (string (code-char 966))
+ (gethash "chi" entities) (string (code-char 967))
+ (gethash "psi" entities) (string (code-char 968))
+ (gethash "omega" entities) (string (code-char 969))
+ (gethash "thetasym" entities) (string (code-char 977))
+ (gethash "upsih" entities) (string (code-char 978))
+ (gethash "piv" entities) (string (code-char 982))
+ (gethash "ensp" entities) (string (code-char 8194))
+ (gethash "emsp" entities) (string (code-char 8195))
+ (gethash "thinsp" entities) (string (code-char 8201))
+ (gethash "zwnj" entities) (string (code-char 8204))
+ (gethash "zwj" entities) (string (code-char 8205))
+ (gethash "lrm" entities) (string (code-char 8206))
+ (gethash "rlm" entities) (string (code-char 8207))
+ (gethash "ndash" entities) (string (code-char 8211))
+ (gethash "mdash" entities) (string (code-char 8212))
+ (gethash "lsquo" entities) (string (code-char 8216))
+ (gethash "rsquo" entities) (string (code-char 8217))
+ (gethash "sbquo" entities) (string (code-char 8218))
+ (gethash "ldquo" entities) (string (code-char 8220))
+ (gethash "rdquo" entities) (string (code-char 8221))
+ (gethash "bdquo" entities) (string (code-char 8222))
+ (gethash "dagger" entities) (string (code-char 8224))
+ (gethash "Dagger" entities) (string (code-char 8225))
+ (gethash "bull" entities) (string (code-char 8226))
+ (gethash "hellip" entities) (string (code-char 8230))
+ (gethash "permil" entities) (string (code-char 8240))
+ (gethash "prime" entities) (string (code-char 8242))
+ (gethash "Prime" entities) (string (code-char 8243))
+ (gethash "lsaquo" entities) (string (code-char 8249))
+ (gethash "rsaquo" entities) (string (code-char 8250))
+ (gethash "oline" entities) (string (code-char 8254))
+ (gethash "frasl" entities) (string (code-char 8260))
+ (gethash "euro" entities) (string (code-char 8364))
+ (gethash "image" entities) (string (code-char 8465))
+ (gethash "weierp" entities) (string (code-char 8472))
+ (gethash "real" entities) (string (code-char 8476))
+ (gethash "trade" entities) (string (code-char 8482))
+ (gethash "alefsym" entities) (string (code-char 8501))
+ (gethash "larr" entities) (string (code-char 8592))
+ (gethash "uarr" entities) (string (code-char 8593))
+ (gethash "rarr" entities) (string (code-char 8594))
+ (gethash "darr" entities) (string (code-char 8595))
+ (gethash "harr" entities) (string (code-char 8596))
+ (gethash "crarr" entities) (string (code-char 8629))
+ (gethash "lArr" entities) (string (code-char 8656))
+ (gethash "uArr" entities) (string (code-char 8657))
+ (gethash "rArr" entities) (string (code-char 8658))
+ (gethash "dArr" entities) (string (code-char 8659))
+ (gethash "hArr" entities) (string (code-char 8660))
+ (gethash "forall" entities) (string (code-char 8704))
+ (gethash "part" entities) (string (code-char 8706))
+ (gethash "exist" entities) (string (code-char 8707))
+ (gethash "empty" entities) (string (code-char 8709))
+ (gethash "nabla" entities) (string (code-char 8711))
+ (gethash "isin" entities) (string (code-char 8712))
+ (gethash "notin" entities) (string (code-char 8713))
+ (gethash "ni" entities) (string (code-char 8715))
+ (gethash "prod" entities) (string (code-char 8719))
+ (gethash "sum" entities) (string (code-char 8721))
+ (gethash "minus" entities) (string (code-char 8722))
+ (gethash "lowast" entities) (string (code-char 8727))
+ (gethash "radic" entities) (string (code-char 8730))
+ (gethash "prop" entities) (string (code-char 8733))
+ (gethash "infin" entities) (string (code-char 8734))
+ (gethash "ang" entities) (string (code-char 8736))
+ (gethash "and" entities) (string (code-char 8743))
+ (gethash "or" entities) (string (code-char 8744))
+ (gethash "cap" entities) (string (code-char 8745))
+ (gethash "cup" entities) (string (code-char 8746))
+ (gethash "int" entities) (string (code-char 8747))
+ (gethash "there4" entities) (string (code-char 8756))
+ (gethash "sim" entities) (string (code-char 8764))
+ (gethash "cong" entities) (string (code-char 8773))
+ (gethash "asymp" entities) (string (code-char 8776))
+ (gethash "ne" entities) (string (code-char 8800))
+ (gethash "equiv" entities) (string (code-char 8801))
+ (gethash "le" entities) (string (code-char 8804))
+ (gethash "ge" entities) (string (code-char 8805))
+ (gethash "sub" entities) (string (code-char 8834))
+ (gethash "sup" entities) (string (code-char 8835))
+ (gethash "nsub" entities) (string (code-char 8836))
+ (gethash "sube" entities) (string (code-char 8838))
+ (gethash "supe" entities) (string (code-char 8839))
+ (gethash "oplus" entities) (string (code-char 8853))
+ (gethash "otimes" entities) (string (code-char 8855))
+ (gethash "perp" entities) (string (code-char 8869))
+ (gethash "sdot" entities) (string (code-char 8901))
+ (gethash "lceil" entities) (string (code-char 8968))
+ (gethash "rceil" entities) (string (code-char 8969))
+ (gethash "lfloor" entities) (string (code-char 8970))
+ (gethash "rfloor" entities) (string (code-char 8971))
+ (gethash "lang" entities) (string (code-char 9001))
+ (gethash "rang" entities) (string (code-char 9002))
+ (gethash "loz" entities) (string (code-char 9674))
+ (gethash "spades" entities) (string (code-char 9824))
+ (gethash "clubs" entities) (string (code-char 9827))
+ (gethash "hearts" entities) (string (code-char 9829))
+ (gethash "diams" entities) (string (code-char 9830)))
+ entities))
+
(defun resolve-entity (stream extendable-string entities entity)
"Read and resolve an XML entity from stream, positioned after the '&' entity marker,
accepting &name; &#DEC; and &#xHEX; formats,
@@ -341,7 +603,9 @@
((entities :documentation "A hashtable mapping XML entity names to their replacement stings"
:accessor get-entities
:initarg :entities
- :initform (make-standard-entities))
+ :initform (if *html-compatibility-mode*
+ (make-html-entities)
+ (make-standard-entities)))
(seed :documentation "The user seed object"
:accessor get-seed
:initarg :seed
More information about the s-xml-devel
mailing list