From 11907e10c73883e5dcdaba11a093ef01c7ee2de8 Mon Sep 17 00:00:00 2001
From: Ralph Amissah <ralph@amissah.com>
Date: Sun, 27 Jan 2013 16:26:29 -0500
Subject: v4: check xml representation of characters (& < > in particular)

---
 lib/sisu/v4/epub_format.rb     |  63 ++++++++++++++----------
 lib/sisu/v4/shared_metadata.rb | 109 +++++------------------------------------
 2 files changed, 47 insertions(+), 125 deletions(-)

(limited to 'lib')

diff --git a/lib/sisu/v4/epub_format.rb b/lib/sisu/v4/epub_format.rb
index 84d32000..dd3273d0 100644
--- a/lib/sisu/v4/epub_format.rb
+++ b/lib/sisu/v4/epub_format.rb
@@ -1217,6 +1217,18 @@ module SiSU_EPUB_Format
       WOK
     end
   end
+  module SanitizeXML
+    def self.xml(x)
+      if x.is_a?(String)
+        x.gsub(/&/,'&amp;').
+          gsub(/</,'&lt;').gsub(/>/,'&gt;').
+          #gsub(/</,'&#60;').gsub(/>/,'&#62;').
+          gsub(/\\\\/,'<br />').
+          gsub(/&lt;br(?: \/)?&gt;/,'<br />')
+      else x
+      end
+    end
+  end
   class HeadInformation
     include SiSU_Viz
     attr_reader :md,:rdf,:vz
@@ -1347,10 +1359,12 @@ output_epub_cont_seg.close
       end
       def head
         depth=@md.lvs[1] + @md.lvs[2] + @md.lvs[3] + @md.lvs[4]
+        title=SanitizeXML.xml(@md.title.full)
+        author=SanitizeXML.xml(@md.author)
         <<-WOK
     <!-- four required metadata items (for all NCX documents,
       (including the relaxed constraints of OPS 2.0) -->
-    <title>#{@md.title.full} by #{@md.author}</title>
+    <title>#{title} by #{author}</title>
     <link href="css/xhtml.css" rel="stylesheet" type="text/css" id="main-css" />
     <meta name="dtb:uid" content="urn:uuid:#{@md.dgst[1]}" />
     <!-- <meta name="epub-creator" content="#{@md.publisher}" /> -->
@@ -1365,16 +1379,18 @@ output_epub_cont_seg.close
         WOK
       end
       def doc_title
+        txt=SanitizeXML.xml(@md.title.full) # XML-escape the full title before it is interpolated into <text>
         <<-WOK
   <docTitle>
-    <text>#{@md.title.full}</text>
+    <text>#{txt}</text>
   </docTitle>
         WOK
       end
       def doc_author
+        txt=SanitizeXML.xml(@md.author) # XML-escape the author string before it is interpolated into <text>
         <<-WOK
   <docAuthor>
-    <text>#{@md.author}</text>
+    <text>#{txt}</text>
   </docAuthor>
         WOK
       end
@@ -1466,12 +1482,10 @@ output_epub_cont_seg.close
             m=(m.empty?) \
             ? (surname + other_names)
             : (m + '; ' + surname + ', ' + other_names)
-            m=m.gsub(/</,'&lt;').gsub(/>/,'&gt;').
-              gsub(/&lt;br(?: \/)?&gt;/,';')
+            m=SanitizeXML.xml(m)
           end
           x=@md.creator.author.dup
-          x=x.gsub(/</,'&lt;').gsub(/>/,'&gt;').
-            gsub(/&lt;br(?: \/)?&gt;/,'<br />')
+          x=SanitizeXML.xml(x)
           %{\n    <dc:creator opf:file-as="#{m}" opf:role="aut">#{x}</dc:creator>}
         else ''
         end
@@ -1488,12 +1502,10 @@ output_epub_cont_seg.close
             m=(m.empty?) \
             ? (surname + other_names)
             : (m + '; ' + surname + ', ' + other_names)
-            m=m.gsub(/</,'&lt;').gsub(/>/,'&gt;').
-              gsub(/&lt;br(?: \/)?&gt;/,';')
+            m=SanitizeXML.xml(m)
           end
           x=@md.creator.editor.dup
-          x=x.gsub(/</,'&lt;').gsub(/>/,'&gt;').
-            gsub(/&lt;br(?: \/)?&gt;/,'<br />')
+          x=SanitizeXML.xml(x)
           %{\n    <dc:creator opf:file-as="#{m}" opf:role="edt">#{x}</dc:creator>}
         else ''
         end
@@ -1510,12 +1522,10 @@ output_epub_cont_seg.close
             m=(m.empty?) \
             ? (surname + other_names)
             : (m + '; ' + surname + ', ' + other_names)
-            m=m.gsub(/</,'&lt;').gsub(/>/,'&gt;').
-              gsub(/&lt;br(?: \/)?&gt;/,';')
+            m=SanitizeXML.xml(m)
           end
           x=@md.creator.translator.dup
-          x=x.gsub(/</,'&lt;').gsub(/>/,'&gt;').
-            gsub(/&lt;br(?: \/)?&gt;/,'<br />')
+          x=SanitizeXML.xml(x)
           %{\n    <dc:creator opf:file-as="#{m}" opf:role="trl">#{x}</dc:creator>}
         else ''
         end
@@ -1532,28 +1542,24 @@ output_epub_cont_seg.close
             m=(m.empty?) \
             ? (surname + other_names)
             : (m + '; ' + surname + ', ' + other_names)
-            m=m.gsub(/</,'&lt;').gsub(/>/,'&gt;').
-              gsub(/&lt;br(?: \/)?&gt;/,';')
+            m=SanitizeXML.xml(m)
           end
           x=@md.creator.illustrator.dup
-          x=x.gsub(/</,'&lt;').gsub(/>/,'&gt;').
-            gsub(/&lt;br(?: \/)?&gt;/,'<br />')
+          x=SanitizeXML.xml(x)
           %{\n    <dc:creator opf:file-as="#{m}" opf:role="ill">#{x}</dc:creator>}
         else ''
         end
         date_published=if defined? @md.date.published \
         and @md.date.published =~/\S+/
           x=@md.date.published.dup
-          x=x.gsub(/</,'&lt;').gsub(/>/,'&gt;').
-            gsub(/&lt;br(?: \/)?&gt;/,'<br />')
+          x=SanitizeXML.xml(x)
           %{\n    <dc:date opf:event="published">#{x}</dc:date>}
         else ''
         end
         subject=if defined? @md.classify.subject \
         and @md.classify.subject =~/\S+/
           x=@md.classify.subject.dup
-          x=x.gsub(/</,'&lt;').gsub(/>/,'&gt;').
-            gsub(/&lt;br(?: \/)?&gt;/,'<br />')
+          x=SanitizeXML.xml(x)
           %{\n    <dc:subject>#{x}</dc:subject>}
         else ''
         end
@@ -1565,7 +1571,7 @@ output_epub_cont_seg.close
         end
         rights=if defined? @md.rights.all \
         and @md.rights.all =~/\S+/
-          rights=@md.rights.all.gsub(/<br>/,'<br />')
+          rights=SanitizeXML.xml(@md.rights.all)
           %{\n    <dc:rights>#{rights}</dc:rights>}
         else ''
         end
@@ -1750,9 +1756,8 @@ output_epub_cont_seg.close
     end
     def rights
       def all
-        rghts=@md.rights.all.gsub(/<br>/,'<br />')
-        rghts=rghts.gsub(/^\s*Copyright\s+\(C\)/,'Copyright <sup>&copy;</sup>&nbsp;')
-        %{<p class="small_left">Rights: #{rghts}</p>}
+        rights=SanitizeXML.xml(@md.rights.all) # escaped rights text; the former "Copyright (C)" rewrite is no longer applied
+        %{<p class="small_left">Rights: #{rights}</p>}
       end
       self
     end
@@ -2068,15 +2073,19 @@ output_epub_cont_seg.close
 #{@vz.table_close}}
     end
     def toc_head_copy_at
+      @txt=SanitizeXML.xml(@txt) # XML-escape @txt and store the escaped text back in @txt
       %{<p class="center">#{@txt}</p>\n}
     end
     def center
+      @txt=SanitizeXML.xml(@txt) # XML-escape @txt and store the escaped text back in @txt
       %{<p class="center">#{@txt}</p>\n}
     end
     def bold
+      @txt=SanitizeXML.xml(@txt) # XML-escape @txt and store the escaped text back in @txt
       %{<p class="bold">#{@txt}</p>\n}
     end
     def center_bold
+      @txt=SanitizeXML.xml(@txt) # XML-escape @txt and store the escaped text back in @txt
       %{<p class="centerbold">#{@txt}</p>\n}
     end
   end
diff --git a/lib/sisu/v4/shared_metadata.rb b/lib/sisu/v4/shared_metadata.rb
index 44c7243e..8b660208 100644
--- a/lib/sisu/v4/shared_metadata.rb
+++ b/lib/sisu/v4/shared_metadata.rb
@@ -725,107 +725,19 @@ module SiSU_Metadata
     end
     def char_enc(str)
       @s=str
+      def amp
+        if @s \
+        and @s.is_a?(String)
+          @s=@s.gsub(/&/u,'&amp;')
+        end
+        @s
+      end
       def utf8
         if @s \
         and @s.is_a?(String)
           @s=@s.gsub(/<br(?: \/)?>/u,Mx[:br_paragraph]).
-            gsub(/</um,'&#60;'). # '&lt;'   # &#060;
-            gsub(/</um,'&#60;'). # '&lt;'   # &#060;
-            gsub(/>/um,'&#62;'). # '&gt;'   # &#062;
-            gsub(/¢/um,'&#162;'). # '&cent;'   # &#162;
-            gsub(/£/um,'&#163;'). # '&pound;'  # &#163;
-            gsub(/¥/um,'&#165;'). # '&yen;'    # &#165;
-            gsub(/§/um,'&#167;'). # '&sect;'   # &#167;
-            gsub(/©/um,'&#169;'). # '&copy;'   # &#169;
-            gsub(/ª/um,'&#170;'). # '&ordf;'   # &#170;
-            gsub(/«/um,'&#171;'). # '&laquo;'  # &#171;
-            gsub(/®/um,'&#174;'). # '&reg;'    # &#174;
-            gsub(/°/um,'&#176;'). # '&deg;'    # &#176;
-            gsub(/±/um,'&#177;'). # '&plusmn;' # &#177;
-            gsub(/²/um,'&#178;'). # '&sup2;'   # &#178;
-            gsub(/³/um,'&#179;'). # '&sup3;'   # &#179;
-            gsub(/µ/um,'&#181;'). # '&micro;'  # &#181;
-            gsub(/¶/um,'&#182;'). # '&para;'   # &#182;
-            gsub(/¹/um,'&#185;'). # '&sup1;'   # &#185;
-            gsub(/º/um,'&#186;'). # '&ordm;'   # &#186;
-            gsub(/»/um,'&#187;'). # '&raquo;'  # &#187;
-            gsub(/¼/um,'&#188;'). # '&frac14;' # &#188;
-            gsub(/½/um,'&#189;'). # '&frac12;' # &#189;
-            gsub(/¾/um,'&#190;'). # '&frac34;' # &#190;
-            gsub(/×/um,'&#215;'). # '&times;'  # &#215;
-            gsub(/÷/um,'&#247;'). # '&divide;' # &#247;
-            gsub(/¿/um,'&#191;'). # '&iquest;' # &#191;
-            gsub(/À/um,'&#192;'). # '&Agrave;' # &#192;
-            gsub(/Á/um,'&#193;'). # '&Aacute;' # &#193;
-            gsub(/Â/um,'&#194;'). # '&Acirc;'  # &#194;
-            gsub(/Ã/um,'&#195;'). # '&Atilde;' # &#195;
-            gsub(/Ä/um,'&#196;'). # '&Auml;'   # &#196;
-            gsub(/Å/um,'&#197;'). # '&Aring;'  # &#197;
-            gsub(/Æ/um,'&#198;'). # '&AElig;'  # &#198;
-            gsub(/Ç/um,'&#199;'). # '&Ccedil;' # &#199;
-            gsub(/È/um,'&#200;'). # '&Egrave;' # &#200;
-            gsub(/É/um,'&#201;'). # '&Eacute;' # &#201;
-            gsub(/Ê/um,'&#202;'). # '&Ecirc;'  # &#202;
-            gsub(/Ë/um,'&#203;'). # '&Euml;'   # &#203;
-            gsub(/Ì/um,'&#204;'). # '&Igrave;' # &#204;
-            gsub(/Í/um,'&#205;'). # '&Iacute;' # &#205;
-            gsub(/Î/um,'&#206;'). # '&Icirc;'  # &#206;
-            gsub(/Ï/um,'&#207;'). # '&Iuml;'   # &#207;
-            gsub(/Ð/um,'&#208;'). # '&ETH;'    # &#208;
-            gsub(/Ñ/um,'&#209;'). # '&Ntilde;' # &#209;
-            gsub(/Ò/um,'&#210;'). # '&Ograve;' # &#210;
-            gsub(/Ó/um,'&#211;'). # '&Oacute;' # &#211;
-            gsub(/Ô/um,'&#212;'). # '&Ocirc;'  # &#212;
-            gsub(/Õ/um,'&#213;'). # '&Otilde;' # &#213;
-            gsub(/Ö/um,'&#214;'). # '&Ouml;'   # &#214;
-            gsub(/Ø/um,'&#216;'). # '&Oslash;' # &#216;
-            gsub(/Ù/um,'&#217;'). # '&Ugrave;' # &#217;
-            gsub(/Ú/um,'&#218;'). # '&Uacute;' # &#218;
-            gsub(/Û/um,'&#219;'). # '&Ucirc;'  # &#219;
-            gsub(/Ü/um,'&#220;'). # '&Uuml;'   # &#220;
-            gsub(/Ý/um,'&#221;'). # '&Yacute;' # &#221;
-            gsub(/Þ/um,'&#222;'). # '&THORN;'  # &#222;
-            gsub(/ß/um,'&#223;'). # '&szlig;'  # &#223;
-            gsub(/à/um,'&#224;'). # '&agrave;' # &#224;
-            gsub(/á/um,'&#225;'). # '&aacute;' # &#225;
-            gsub(/â/um,'&#226;'). # '&acirc;'  # &#226;
-            gsub(/ã/um,'&#227;'). # '&atilde;' # &#227;
-            gsub(/ä/um,'&#228;'). # '&auml;'   # &#228;
-            gsub(/å/um,'&#229;'). # '&aring;'  # &#229;
-            gsub(/æ/um,'&#230;'). # '&aelig;'  # &#230;
-            gsub(/ç/um,'&#231;'). # '&ccedil;' # &#231;
-            gsub(/è/um,'&#232;'). # '&egrave;' # &#232;
-            gsub(/é/um,'&#233;'). # '&acute;'  # &#233;
-            gsub(/ê/um,'&#234;'). # '&circ;'   # &#234;
-            gsub(/ë/um,'&#235;'). # '&euml;'   # &#235;
-            gsub(/ì/um,'&#236;'). # '&igrave;' # &#236;
-            gsub(/í/um,'&#237;'). # '&acute;'  # &#237;
-            gsub(/î/um,'&#238;'). # '&icirc;'  # &#238;
-            gsub(/ï/um,'&#239;'). # '&iuml;'   # &#239;
-            gsub(/ð/um,'&#240;'). # '&eth;'    # &#240;
-            gsub(/ñ/um,'&#241;'). # '&ntilde;' # &#241;
-            gsub(/ò/um,'&#242;'). # '&ograve;' # &#242;
-            gsub(/ó/um,'&#243;'). # '&oacute;' # &#243;
-            gsub(/ô/um,'&#244;'). # '&ocirc;'  # &#244;
-            gsub(/õ/um,'&#245;'). # '&otilde;' # &#245;
-            gsub(/ö/um,'&#246;'). # '&ouml;'   # &#246;
-            gsub(/ø/um,'&#248;'). # '&oslash;' # &#248;
-            gsub(/ù/um,'&#250;'). # '&ugrave;' # &#250;
-            gsub(/ú/um,'&#251;'). # '&uacute;' # &#251;
-            gsub(/û/um,'&#252;'). # '&ucirc;'  # &#252;
-            gsub(/ü/um,'&#253;'). # '&uuml;'   # &#253;
-            gsub(/þ/um,'&#254;'). # '&thorn;'  # &#254;
-            gsub(/ÿ/um,'&#255;'). # '&yuml;'   # &#255;
-            gsub(/‘/um,'&#8216;'). # '&lsquo;' # &#8216;
-            gsub(/’/um,'&#8217;'). # '&rsquo;' # &#8217;
-            gsub(/“/um,'&#8220;'). # &ldquo;   # &#8220;
-            gsub(/”/um,'&#8221;'). # &rdquo;   # &#8221;
-            gsub(/–/um,'&#8211;'). # &ndash;   # &#8211;
-            gsub(/—/um,'&#8212;'). # &mdash;   # &#8212;
-            gsub(/∝/um,'&#8733;'). # &prop;    # &#8733;
-            gsub(/∞/um,'&#8734;'). # &infin;   # &#8734;
-            gsub(/™/um,'&#8482;'). # &trade;   # &#8482;
-            gsub(/✠/um,'&#10016;'). # &cross;  # &#10016;
+            gsub(/</um,'&lt;').gsub(/>/um,'&gt;').
+            #gsub(/</um,'&#60;').gsub(/>/um,'&#62;').
             gsub(/ /um,' ').       # space identify
             gsub(/ /um,' ').       # space identify
             gsub(/#{Mx[:br_paragraph]}/u,'<br />')
@@ -888,7 +800,8 @@ WOK
     end
     def xhtml_display
       def meta_para
-        inf_xml=char_enc(@inf).utf8
+        inf_xml=char_enc(@inf).amp
+        inf_xml=char_enc(inf_xml).utf8
         %{<p class="norm">
   <b>#{@tag}</b>: #{inf_xml}
 </p>}
-- 
cgit v1.2.3