v5 v6: db, text search & display field, footnotes moved to end of text object
author Ralph Amissah <ralph@amissah.com>
Mon, 20 Oct 2014 01:13:52 +0000 (21:13 -0400)
committer Ralph Amissah <ralph@amissah.com>
Mon, 20 Oct 2014 01:13:52 +0000 (21:13 -0400)
* cleaner, more useful search results
* cleaner text search field
* separate footnote fields redundant for search purposes

data/doc/sisu/CHANGELOG_v5
data/doc/sisu/CHANGELOG_v6
lib/sisu/v5/db_import.rb
lib/sisu/v5/db_sqltxt.rb
lib/sisu/v6/db_import.rb
lib/sisu/v6/db_sqltxt.rb

index 0271a63713e081a5d8d1797892d375c476b33c2e..39591639b9e43a300fb7598addfff950f03fa7d0 100644 (file)
@@ -42,6 +42,11 @@ http://www.jus.uio.no/sisu/pkg/src/sisu_5.7.1.orig.tar.xz
   * html_format, "id"s for objects & footnotes
   * remove trailing backslash for empty linebreak & paragraph, <br> <p>
 
+* db, text search & display field, footnotes moved to end of text object
+  * cleaner, more useful search results
+  * cleaner text search field
+  * separate footnote fields redundant for search purposes
+
 %% 5.7.0.orig.tar.xz (2014-10-12:40/7)
 http://git.sisudoc.org/gitweb/?p=code/sisu.git;a=log;h=refs/tags/sisu_5.7.0
 http://git.sisudoc.org/gitweb/?p=code/sisu.git;a=log;h=refs/tags/debian/sisu_5.7.0-1
index 005803bf5bc2d225c3cc567d422f84e429d56b90..6a76425c81ada7b9c743365d63bd6c41baea7db1 100644 (file)
@@ -32,6 +32,11 @@ http://www.jus.uio.no/sisu/pkg/src/sisu_6.3.1.orig.tar.xz
   * html_format, "id"s for objects & footnotes
   * remove trailing backslash for empty linebreak & paragraph, <br> <p>
 
+* db, text search & display field, footnotes moved to end of text object
+  * cleaner, more useful search results
+  * cleaner text search field
+  * separate footnote fields redundant for search purposes
+
 %% 6.3.0.orig.tar.xz (2014-10-12:40/7)
 http://git.sisudoc.org/gitweb/?p=code/sisu.git;a=log;h=refs/tags/sisu_6.3.0
 http://git.sisudoc.org/gitweb/?p=code/sisu.git;a=log;h=refs/tags/debian/sisu_6.3.0-1
index 59cff28a5414e3f0fd6ba30839f81af22da5ccb8..72fb375363e4d7032f1784189ecd0fcce7e7e5e1 100644 (file)
@@ -334,17 +334,17 @@ module SiSU_DbImport
         @en,@en_ast,@en_pls,@tuple_array=[],[],[],[]
         @col[:en_a],@col[:en_z]=nil,nil
         ao_array.each do |data|
-          data.obj.gsub!(/#{Mx[:fa_bold_o]}(.+?)#{Mx[:fa_bold_c]}/,'\1')
-          data.obj.gsub!(/#{Mx[:fa_italics_o]}(.+?)#{Mx[:fa_italics_c]}/,'\1')
-          data.obj.gsub!(/#{Mx[:fa_underscore_o]}(.+?)#{Mx[:fa_underscore_c]}/,'\1')
-          data.obj.gsub!(/#{Mx[:fa_superscript_o]}(.+?)#{Mx[:fa_superscript_c]}/,'\1')
-          data.obj.gsub!(/#{Mx[:fa_subscript_o]}(.+?)#{Mx[:fa_subscript_c]}/,'\1')
-          data.obj.gsub!(/#{Mx[:fa_insert_o]}(.+?)#{Mx[:fa_insert_c]}/,'\1')
-          data.obj.gsub!(/#{Mx[:fa_cite_o]}(.+?)#{Mx[:fa_cite_c]}/,'\1')
-          data.obj.gsub!(/#{Mx[:fa_strike_o]}(.+?)#{Mx[:fa_strike_c]}/,'\1')
-          data.obj.gsub!(/#{Mx[:fa_monospace_o]}(.+?)#{Mx[:fa_monospace_c]}/,'\1')
-          data.obj.gsub!(/#{Mx[:gl_o]}(●)#{Mx[:gl_c]}\s*/,'\1 ')
-          data.obj.gsub!(/#{Mx[:tag_o]}\S+?#{Mx[:tag_c]}/,'') #check
+          data.obj=data.obj.gsub(/#{Mx[:fa_bold_o]}(.+?)#{Mx[:fa_bold_c]}/,'\1').
+            gsub(/#{Mx[:fa_italics_o]}(.+?)#{Mx[:fa_italics_c]}/,'\1').
+            gsub(/#{Mx[:fa_underscore_o]}(.+?)#{Mx[:fa_underscore_c]}/,'\1').
+            gsub(/#{Mx[:fa_superscript_o]}(.+?)#{Mx[:fa_superscript_c]}/,'\1').
+            gsub(/#{Mx[:fa_subscript_o]}(.+?)#{Mx[:fa_subscript_c]}/,'\1').
+            gsub(/#{Mx[:fa_insert_o]}(.+?)#{Mx[:fa_insert_c]}/,'\1').
+            gsub(/#{Mx[:fa_cite_o]}(.+?)#{Mx[:fa_cite_c]}/,'\1').
+            gsub(/#{Mx[:fa_strike_o]}(.+?)#{Mx[:fa_strike_c]}/,'\1').
+            gsub(/#{Mx[:fa_monospace_o]}(.+?)#{Mx[:fa_monospace_c]}/,'\1').
+            gsub(/#{Mx[:gl_o]}(●)#{Mx[:gl_c]}\s*/,'\1 ').
+            gsub(/#{Mx[:tag_o]}\S+?#{Mx[:tag_c]}/,'') #check
           @col[:seg]=@@seg
           if data.of ==:para \
           || data.of ==:heading \
@@ -374,7 +374,7 @@ module SiSU_DbImport
               @col[:lid]+=1
               txt=endnotes(txt).extract_any
               body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_minus
-              @col[:body]=special_character_escape(body)
+              @col[:body]=clean_document_objects_body(body)
               plaintext=@col[:body].dup
               plaintext=strip_markup(plaintext)
               @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)
@@ -425,7 +425,7 @@ module SiSU_DbImport
               @base_url="#{@env.url.root}/#{@md.fnb}/#{@hname}.html"
               txt=endnotes(txt).extract_any
               body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_plus
-              @col[:body]=special_character_escape(body)
+              @col[:body]=clean_document_objects_body(body)
               plaintext=@col[:body].dup
               plaintext=strip_markup(plaintext)
               @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)
@@ -470,7 +470,7 @@ module SiSU_DbImport
               @base_url="#{@env.url.root}/#{@md.fnb}/#{@hname}.html"
               txt=endnotes(txt).extract_any
               body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_plus
-              @col[:body]=special_character_escape(body)
+              @col[:body]=clean_document_objects_body(body)
               plaintext=@col[:body].dup
               plaintext=strip_markup(plaintext)
               @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)
@@ -501,7 +501,7 @@ module SiSU_DbImport
               @base_url="#{@env.url.root}/#{@md.fnb}/#{@hname}.html"
               txt=endnotes(txt).extract_any
               body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_plus
-              @col[:body]=special_character_escape(body)
+              @col[:body]=clean_document_objects_body(body)
               plaintext=@col[:body].dup
               plaintext=strip_markup(plaintext)
               @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)
@@ -532,7 +532,7 @@ module SiSU_DbImport
               @base_url="#{@env.url.root}/#{@md.fnb}/#{@hname}.html"
               txt=endnotes(txt).extract_any
               body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_plus
-              @col[:body]=special_character_escape(body)
+              @col[:body]=clean_document_objects_body(body)
               plaintext=@col[:body].dup
               plaintext=strip_markup(plaintext)
               @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)
@@ -603,7 +603,7 @@ module SiSU_DbImport
               else
                 SiSU_FormatShared::CSS_Format.new(@md,data).norm
               end
-              @col[:body]=special_character_escape(body)
+              @col[:body]=clean_document_objects_body(body)
               plaintext=@col[:body].dup
               plaintext=strip_markup(plaintext)
               @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)
index 6585fd66a28e55c53c5407b9776101e87d61b53b..3f6cf95115a9a242176cfd1d7441e4b1377b46c5 100644 (file)
@@ -60,7 +60,7 @@
 module SiSU_DbText
   class Prepare
     def special_character_escape(str)
-      str=str.gsub(/'/,"''"). #string.gsub!(/'/,"\047") #string.gsub!(/'/,"\\'")
+      str=str.gsub(/'/m,"''"). #string.gsub!(/'/,"\047") #string.gsub!(/'/,"\\'")
         gsub(/(\\)/m,'\1\1'). #ok but with warnings, double backslash on sqlite #str.gsub!(/[\\]/m,'\\x5C') #ok but with warnings, but not for sqlite #str.gsub!(/(\\)/m,'\1') #ok for sqlite not for pgsql
         gsub(/#{Mx[:br_line]}|#{Mx[:br_nl]}/m,"<br>\n").
         gsub(/#{Mx[:tag_o]}\S+?#{Mx[:tag_c]}/m,''). #check
@@ -80,13 +80,29 @@ module SiSU_DbText
           gsub(/#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]}/m,'').
           gsub(/ \s+/m,' ')
         #p s if s =~/[^ \nA-Za-z0-9'"`?!#@$%^&*=+,.;:\[\]()<>{}‹›|\\\/~_-]/
-        s
+        txt_arr << s
       end
-      txt_arr << arr << en
-      #txt_arr=txt_arr.flatten
+      txt_arr=txt_arr << en
       txt=txt_arr.flatten.join("\n")
-      txt=special_character_escape(txt)
-      txt
+      special_character_escape(txt)
+    end
+    def clean_document_objects_body(arr)
+      txt_arr,en,en_arr=[],[],[]
+      arr=(arr.is_a?(String)) ? [ arr ] : arr
+      arr.each do |s|
+        en << s.scan(/#{Mx[:en_a_o]}\s*(.+?)\s*#{Mx[:en_a_c]}/m)
+        s=s.gsub(/#{Mx[:en_a_o]}\s*(\d+).+?#{Mx[:en_a_c]}/m,'<sup>\1</sup>').
+          gsub(/#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]}/m,'').
+          gsub(/ \s+/m,' ')
+        txt_arr << s
+      end
+      en.flatten.each do |e|
+        e=e.sub(/^(\d+)\s*/,'<sup>\1</sup> ')
+        en_arr << e
+      end
+      txt_arr=txt_arr << en_arr
+      txt=txt_arr.flatten.join("\n<br>")
+      special_character_escape(txt)
     end
     def clean_searchable_text_from_document_source(arr)
       txt_arr,en=[],[]
index 9473863da01e323a0c6d1f1e165a8467e73f06b9..5e159451a454952c2987c73aada62bdcdec304cd 100644 (file)
@@ -334,17 +334,17 @@ module SiSU_DbImport
         @en,@en_ast,@en_pls,@tuple_array=[],[],[],[]
         @col[:en_a],@col[:en_z]=nil,nil
         ao_array.each do |data|
-          data.obj.gsub!(/#{Mx[:fa_bold_o]}(.+?)#{Mx[:fa_bold_c]}/,'\1')
-          data.obj.gsub!(/#{Mx[:fa_italics_o]}(.+?)#{Mx[:fa_italics_c]}/,'\1')
-          data.obj.gsub!(/#{Mx[:fa_underscore_o]}(.+?)#{Mx[:fa_underscore_c]}/,'\1')
-          data.obj.gsub!(/#{Mx[:fa_superscript_o]}(.+?)#{Mx[:fa_superscript_c]}/,'\1')
-          data.obj.gsub!(/#{Mx[:fa_subscript_o]}(.+?)#{Mx[:fa_subscript_c]}/,'\1')
-          data.obj.gsub!(/#{Mx[:fa_insert_o]}(.+?)#{Mx[:fa_insert_c]}/,'\1')
-          data.obj.gsub!(/#{Mx[:fa_cite_o]}(.+?)#{Mx[:fa_cite_c]}/,'\1')
-          data.obj.gsub!(/#{Mx[:fa_strike_o]}(.+?)#{Mx[:fa_strike_c]}/,'\1')
-          data.obj.gsub!(/#{Mx[:fa_monospace_o]}(.+?)#{Mx[:fa_monospace_c]}/,'\1')
-          data.obj.gsub!(/#{Mx[:gl_o]}(●)#{Mx[:gl_c]}\s*/,'\1 ')
-          data.obj.gsub!(/#{Mx[:tag_o]}\S+?#{Mx[:tag_c]}/,'') #check
+          data.obj=data.obj.gsub(/#{Mx[:fa_bold_o]}(.+?)#{Mx[:fa_bold_c]}/,'\1').
+            gsub(/#{Mx[:fa_italics_o]}(.+?)#{Mx[:fa_italics_c]}/,'\1').
+            gsub(/#{Mx[:fa_underscore_o]}(.+?)#{Mx[:fa_underscore_c]}/,'\1').
+            gsub(/#{Mx[:fa_superscript_o]}(.+?)#{Mx[:fa_superscript_c]}/,'\1').
+            gsub(/#{Mx[:fa_subscript_o]}(.+?)#{Mx[:fa_subscript_c]}/,'\1').
+            gsub(/#{Mx[:fa_insert_o]}(.+?)#{Mx[:fa_insert_c]}/,'\1').
+            gsub(/#{Mx[:fa_cite_o]}(.+?)#{Mx[:fa_cite_c]}/,'\1').
+            gsub(/#{Mx[:fa_strike_o]}(.+?)#{Mx[:fa_strike_c]}/,'\1').
+            gsub(/#{Mx[:fa_monospace_o]}(.+?)#{Mx[:fa_monospace_c]}/,'\1').
+            gsub(/#{Mx[:gl_o]}(●)#{Mx[:gl_c]}\s*/,'\1 ').
+            gsub(/#{Mx[:tag_o]}\S+?#{Mx[:tag_c]}/,'') #check
           @col[:seg]=@@seg
           if data.of ==:para \
           || data.of ==:heading \
@@ -374,7 +374,7 @@ module SiSU_DbImport
               @col[:lid]+=1
               txt=endnotes(txt).extract_any
               body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_minus
-              @col[:body]=special_character_escape(body)
+              @col[:body]=clean_document_objects_body(body)
               plaintext=@col[:body].dup
               plaintext=strip_markup(plaintext)
               @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)
@@ -425,7 +425,7 @@ module SiSU_DbImport
               @base_url="#{@env.url.root}/#{@md.fnb}/#{@hname}.html"
               txt=endnotes(txt).extract_any
               body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_plus
-              @col[:body]=special_character_escape(body)
+              @col[:body]=clean_document_objects_body(body)
               plaintext=@col[:body].dup
               plaintext=strip_markup(plaintext)
               @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)
@@ -470,7 +470,7 @@ module SiSU_DbImport
               @base_url="#{@env.url.root}/#{@md.fnb}/#{@hname}.html"
               txt=endnotes(txt).extract_any
               body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_plus
-              @col[:body]=special_character_escape(body)
+              @col[:body]=clean_document_objects_body(body)
               plaintext=@col[:body].dup
               plaintext=strip_markup(plaintext)
               @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)
@@ -501,7 +501,7 @@ module SiSU_DbImport
               @base_url="#{@env.url.root}/#{@md.fnb}/#{@hname}.html"
               txt=endnotes(txt).extract_any
               body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_plus
-              @col[:body]=special_character_escape(body)
+              @col[:body]=clean_document_objects_body(body)
               plaintext=@col[:body].dup
               plaintext=strip_markup(plaintext)
               @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)
@@ -532,7 +532,7 @@ module SiSU_DbImport
               @base_url="#{@env.url.root}/#{@md.fnb}/#{@hname}.html"
               txt=endnotes(txt).extract_any
               body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_plus
-              @col[:body]=special_character_escape(body)
+              @col[:body]=clean_document_objects_body(body)
               plaintext=@col[:body].dup
               plaintext=strip_markup(plaintext)
               @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)
@@ -603,7 +603,7 @@ module SiSU_DbImport
               else
                 SiSU_FormatShared::CSS_Format.new(@md,data).norm
               end
-              @col[:body]=special_character_escape(body)
+              @col[:body]=clean_document_objects_body(body)
               plaintext=@col[:body].dup
               plaintext=strip_markup(plaintext)
               @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)
index 2fd39fb7b0daafc8198ef1611c98444e0e238b69..2375d5ca945632d2302dd9de2ef4dce21dbec57c 100644 (file)
@@ -60,7 +60,7 @@
 module SiSU_DbText
   class Prepare
     def special_character_escape(str)
-      str=str.gsub(/'/,"''"). #string.gsub!(/'/,"\047") #string.gsub!(/'/,"\\'")
+      str=str.gsub(/'/m,"''"). #string.gsub!(/'/,"\047") #string.gsub!(/'/,"\\'")
         gsub(/(\\)/m,'\1\1'). #ok but with warnings, double backslash on sqlite #str.gsub!(/[\\]/m,'\\x5C') #ok but with warnings, but not for sqlite #str.gsub!(/(\\)/m,'\1') #ok for sqlite not for pgsql
         gsub(/#{Mx[:br_line]}|#{Mx[:br_nl]}/m,"<br>\n").
         gsub(/#{Mx[:tag_o]}\S+?#{Mx[:tag_c]}/m,''). #check
@@ -80,13 +80,29 @@ module SiSU_DbText
           gsub(/#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]}/m,'').
           gsub(/ \s+/m,' ')
         #p s if s =~/[^ \nA-Za-z0-9'"`?!#@$%^&*=+,.;:\[\]()<>{}‹›|\\\/~_-]/
-        s
+        txt_arr << s
       end
-      txt_arr << arr << en
-      #txt_arr=txt_arr.flatten
+      txt_arr=txt_arr << en
       txt=txt_arr.flatten.join("\n")
-      txt=special_character_escape(txt)
-      txt
+      special_character_escape(txt)
+    end
+    def clean_document_objects_body(arr)
+      txt_arr,en,en_arr=[],[],[]
+      arr=(arr.is_a?(String)) ? [ arr ] : arr
+      arr.each do |s|
+        en << s.scan(/#{Mx[:en_a_o]}\s*(.+?)\s*#{Mx[:en_a_c]}/m)
+        s=s.gsub(/#{Mx[:en_a_o]}\s*(\d+).+?#{Mx[:en_a_c]}/m,'<sup>\1</sup>').
+          gsub(/#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]}/m,'').
+          gsub(/ \s+/m,' ')
+        txt_arr << s
+      end
+      en.flatten.each do |e|
+        e=e.sub(/^(\d+)\s*/,'<sup>\1</sup> ')
+        en_arr << e
+      end
+      txt_arr=txt_arr << en_arr
+      txt=txt_arr.flatten.join("\n<br>")
+      special_character_escape(txt)
     end
     def clean_searchable_text_from_document_source(arr)
       txt_arr,en=[],[]