v5 v6: sql, clean searchable text
author Ralph Amissah <ralph@amissah.com>
Mon, 29 Sep 2014 01:46:45 +0000 (21:46 -0400)
committer Ralph Amissah <ralph@amissah.com>
Mon, 29 Sep 2014 02:09:38 +0000 (22:09 -0400)
* update for (ao/dal) text representation, fix legacy action

data/doc/sisu/CHANGELOG_v5
data/doc/sisu/CHANGELOG_v6
lib/sisu/v5/db_columns.rb
lib/sisu/v5/db_import.rb
lib/sisu/v5/db_sqltxt.rb
lib/sisu/v6/db_columns.rb
lib/sisu/v6/db_import.rb
lib/sisu/v6/db_sqltxt.rb

index 2b2a7b89691ea8986d8dbf1fefcd12d12283911e..eed72ecb4c6846d621420d7670354acb80a2fb31 100644 (file)
@@ -38,6 +38,9 @@ http://www.jus.uio.no/sisu/pkg/src/sisu_5.6.8.orig.tar.xz
   sisu_5.6.8.orig.tar.xz
   sisu_5.6.8-1.dsc
 
+* sql, clean searchable text, update for (ao/dal) text representation,
+  fix legacy action
+
 %% 5.6.7.orig.tar.xz (2014-09-19:37/5)
 http://git.sisudoc.org/gitweb/?p=code/sisu.git;a=log;h=refs/tags/sisu_5.6.7
 http://git.sisudoc.org/gitweb/?p=code/sisu.git;a=log;h=refs/tags/debian/sisu_5.6.7-1
index 9771fccf2d67631f112cfd4b26c20aa1680e695c..7350e08574b10c990d49507941c5ed0c6cefbfdd 100644 (file)
@@ -28,6 +28,9 @@ http://www.jus.uio.no/sisu/pkg/src/sisu_6.2.9.orig.tar.xz
   sisu_6.2.9.orig.tar.xz
   sisu_6.2.9-1.dsc
 
+* sql, clean searchable text, update for (ao/dal) text representation,
+  fix legacy action
+
 %% 6.2.8.orig.tar.xz (2014-09-19:37/5)
 http://git.sisudoc.org/gitweb/?p=code/sisu.git;a=log;h=refs/tags/sisu_6.2.8
 http://git.sisudoc.org/gitweb/?p=code/sisu.git;a=log;h=refs/tags/debian/sisu_6.2.8-1
index 44d45e9550c1eb1d5358f9ee77b56791a3ba7226..1534104276dfa6de632acbbbc48dbb06384e1b3d 100644 (file)
@@ -81,7 +81,7 @@ module SiSU_DbColumns
           @sisutxt=special_character_escape(src)
         else @sisutxt=''
         end
-        @fulltext=clean_searchable_text(txt_arr)
+        @fulltext=clean_searchable_text_from_document_objects(txt_arr)
        else @sisutxt,@fulltext='',''
       end
     end
index a7f3393985a5b3f59523203723f1f309b2d33413..8a500f8af6ba6010cb9e88736bfa87db9fa20184 100644 (file)
@@ -292,7 +292,7 @@ module SiSU_DbImport
         src=txt_arr.join("\n")
         src=special_character_escape(src)
         @tp[:sisutxt_f],@tp[:sisutxt_i]='sisutxt, ',"'#{src}', "
-        txt=clean_searchable_text(txt_arr)
+        txt=clean_searchable_text_from_document_source(txt_arr)
         #txt=special_character_escape(txt)
         @tp[:fulltxt_f],@tp[:fulltxt_i]='fulltxt, ',"'#{txt}', "
       end
@@ -374,9 +374,9 @@ module SiSU_DbImport
               @col[:body]=special_character_escape(body)
               plaintext=@col[:body].dup
               plaintext=strip_markup(plaintext)
-              @col[:plaintext]=clean_searchable_text(plaintext)
+              @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)
               book_idx=book_idx_hash_to_str(data.idx)
-              @col[:book_idx]=clean_searchable_text(book_idx)
+              @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx)
               if @en[0] then @en_a,@en_z=@en[0].first,@en[0].last
               end
               if @en_ast[0] then @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last
@@ -425,9 +425,9 @@ module SiSU_DbImport
               @col[:body]=special_character_escape(body)
               plaintext=@col[:body].dup
               plaintext=strip_markup(plaintext)
-              @col[:plaintext]=clean_searchable_text(plaintext)
+              @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)
               book_idx=book_idx_hash_to_str(data.idx)
-              @col[:book_idx]=clean_searchable_text(book_idx)
+              @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx)
               @en_a,@en_z=@en[0].first,@en[0].last if @en[0]
               @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0]
               @en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0]
@@ -470,9 +470,9 @@ module SiSU_DbImport
               @col[:body]=special_character_escape(body)
               plaintext=@col[:body].dup
               plaintext=strip_markup(plaintext)
-              @col[:plaintext]=clean_searchable_text(plaintext)
+              @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)
               book_idx=book_idx_hash_to_str(data.idx)
-              @col[:book_idx]=clean_searchable_text(book_idx)
+              @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx)
               @en_a,@en_z=@en[0].first,@en[0].last if @en[0]
               @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0]
               @en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0]
@@ -501,9 +501,9 @@ module SiSU_DbImport
               @col[:body]=special_character_escape(body)
               plaintext=@col[:body].dup
               plaintext=strip_markup(plaintext)
-              @col[:plaintext]=clean_searchable_text(plaintext)
+              @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)
               book_idx=book_idx_hash_to_str(data.idx)
-              @col[:book_idx]=clean_searchable_text(book_idx)
+              @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx)
               @en_a,@en_z=@en[0].first,@en[0].last if @en[0]
               @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0]
               @en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0]
@@ -532,9 +532,9 @@ module SiSU_DbImport
               @col[:body]=special_character_escape(body)
               plaintext=@col[:body].dup
               plaintext=strip_markup(plaintext)
-              @col[:plaintext]=clean_searchable_text(plaintext)
+              @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)
               book_idx=book_idx_hash_to_str(data.idx)
-              @col[:book_idx]=clean_searchable_text(book_idx)
+              @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx)
               @en_a,@en_z=@en[0].first,@en[0].last if @en[0]
               @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0]
               @en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0]
@@ -603,9 +603,9 @@ module SiSU_DbImport
               @col[:body]=special_character_escape(body)
               plaintext=@col[:body].dup
               plaintext=strip_markup(plaintext)
-              @col[:plaintext]=clean_searchable_text(plaintext)
+              @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)
               book_idx=book_idx_hash_to_str(data.idx)
-              @col[:book_idx]=clean_searchable_text(book_idx)
+              @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx)
               t=SiSU_DbTuple::LoadDocuments.new(@conn,@col,@opt,@file_maint)
               @tuple_array << t.tuple
               @en,@en_ast,@en_pls=[],[],[]
index 99d417e15e8fe7149ec262b97852826e399b1df3..adb2b0f06dee7476cce42064ca316fdaab312939 100644 (file)
@@ -71,7 +71,27 @@ module SiSU_DbText
         gsub(/#{Mx[:lnk_o]}\s*(.+?)\s*#{Mx[:lnk_c]}(?:file|ftp):\/\/\S+?([.,!?]?(?:\s|$))/,'\1\2').
         gsub(/#{Mx[:lnk_o]}\s*(.+?)\s*#{Mx[:lnk_c]}#{Mx[:url_o]}\S+?#{Mx[:url_c]}/,'\1')
     end
-    def clean_searchable_text(arr) #produce clean, searchable, plaintext from document source
+    def clean_searchable_text_from_document_objects(arr)
+      txt_arr,en=[],[]
+      arr=(arr.is_a?(String)) ? [ arr ] : arr
+      arr.each do |s|
+        s=s.gsub(/#{Mx[:fa_o]}[a-z]{1,4}#{Mx[:fa_o_c]}/m,'').
+            gsub(/#{Mx[:fa_c_o]}[a-z]{1,4}#{Mx[:fa_c]}/m,'').
+            gsub(/<br>/m,' ')
+        en << s.scan(/#{Mx[:en_a_o]}\s*(.+?)\s*#{Mx[:en_a_c]}/m)
+        s=s.gsub(/#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}/m,'').
+          gsub(/#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]}/m,'').
+          gsub(/ \s+/m,' ')
+        #p s if s =~/[^ \nA-Za-z0-9'"`?!#@$%^&*=+,.;:\[\]()<>{}‹›|\\\/~_-]/
+        s
+      end
+      txt_arr << arr << en
+      #txt_arr=txt_arr.flatten
+      txt=txt_arr.flatten.join("\n")
+      txt=special_character_escape(txt)
+      txt
+    end
+    def clean_searchable_text_from_document_source(arr)
       txt_arr,en=[],[]
       arr=(arr.is_a?(String)) ? arr.split(/\n+/m) : arr
       arr.each do |s|
@@ -107,14 +127,14 @@ module SiSU_DbText
           gsub(/^(?::?[A-C]\~(\S+)?)\s+/m,'').
           gsub(/^%{1,3} .+/m,''). #removed even if contained in code block
           gsub(/<br>/m,' ')
-        en << s.scan(/~\{\s*(.+?)\s*\}~/m)
+        #en << s.scan(/~\{\s*(.+?)\s*\}~/m)
         s=s.gsub(/~\{.+?\}~/m,'').
           gsub(/ \s+/m,' ')
-        #special_character_escape(s)
+        ##special_character_escape(s)
+        #p s if s =~/[^ \nA-Za-z0-9'"`?!#@$%^&*=+,.;:\[\]()<>{}‹›|\\\/~_-]/
         s
       end
       txt_arr << arr << en
-      #txt_arr=txt_arr.flatten
       txt=txt_arr.flatten.join("\n")
       txt=special_character_escape(txt)
       txt
index 343f66e6dc574b006c6eeddc544a52365fae4d0f..005c45b5ee377425a4f2c3abe6bf36059e6f4a0e 100644 (file)
@@ -81,7 +81,7 @@ module SiSU_DbColumns
           @sisutxt=special_character_escape(src)
         else @sisutxt=''
         end
-        @fulltext=clean_searchable_text(txt_arr)
+        @fulltext=clean_searchable_text_from_document_objects(txt_arr)
        else @sisutxt,@fulltext='',''
       end
     end
index 3e2e72716456e9f0ca6a3bc3146be9353517e456..ccffb904d78ede2aba679f21788ed4a3c1b877a3 100644 (file)
@@ -292,7 +292,7 @@ module SiSU_DbImport
         src=txt_arr.join("\n")
         src=special_character_escape(src)
         @tp[:sisutxt_f],@tp[:sisutxt_i]='sisutxt, ',"'#{src}', "
-        txt=clean_searchable_text(txt_arr)
+        txt=clean_searchable_text_from_document_source(txt_arr)
         #txt=special_character_escape(txt)
         @tp[:fulltxt_f],@tp[:fulltxt_i]='fulltxt, ',"'#{txt}', "
       end
@@ -374,9 +374,9 @@ module SiSU_DbImport
               @col[:body]=special_character_escape(body)
               plaintext=@col[:body].dup
               plaintext=strip_markup(plaintext)
-              @col[:plaintext]=clean_searchable_text(plaintext)
+              @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)
               book_idx=book_idx_hash_to_str(data.idx)
-              @col[:book_idx]=clean_searchable_text(book_idx)
+              @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx)
               if @en[0] then @en_a,@en_z=@en[0].first,@en[0].last
               end
               if @en_ast[0] then @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last
@@ -425,9 +425,9 @@ module SiSU_DbImport
               @col[:body]=special_character_escape(body)
               plaintext=@col[:body].dup
               plaintext=strip_markup(plaintext)
-              @col[:plaintext]=clean_searchable_text(plaintext)
+              @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)
               book_idx=book_idx_hash_to_str(data.idx)
-              @col[:book_idx]=clean_searchable_text(book_idx)
+              @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx)
               @en_a,@en_z=@en[0].first,@en[0].last if @en[0]
               @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0]
               @en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0]
@@ -470,9 +470,9 @@ module SiSU_DbImport
               @col[:body]=special_character_escape(body)
               plaintext=@col[:body].dup
               plaintext=strip_markup(plaintext)
-              @col[:plaintext]=clean_searchable_text(plaintext)
+              @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)
               book_idx=book_idx_hash_to_str(data.idx)
-              @col[:book_idx]=clean_searchable_text(book_idx)
+              @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx)
               @en_a,@en_z=@en[0].first,@en[0].last if @en[0]
               @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0]
               @en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0]
@@ -501,9 +501,9 @@ module SiSU_DbImport
               @col[:body]=special_character_escape(body)
               plaintext=@col[:body].dup
               plaintext=strip_markup(plaintext)
-              @col[:plaintext]=clean_searchable_text(plaintext)
+              @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)
               book_idx=book_idx_hash_to_str(data.idx)
-              @col[:book_idx]=clean_searchable_text(book_idx)
+              @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx)
               @en_a,@en_z=@en[0].first,@en[0].last if @en[0]
               @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0]
               @en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0]
@@ -532,9 +532,9 @@ module SiSU_DbImport
               @col[:body]=special_character_escape(body)
               plaintext=@col[:body].dup
               plaintext=strip_markup(plaintext)
-              @col[:plaintext]=clean_searchable_text(plaintext)
+              @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)
               book_idx=book_idx_hash_to_str(data.idx)
-              @col[:book_idx]=clean_searchable_text(book_idx)
+              @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx)
               @en_a,@en_z=@en[0].first,@en[0].last if @en[0]
               @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0]
               @en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0]
@@ -603,9 +603,9 @@ module SiSU_DbImport
               @col[:body]=special_character_escape(body)
               plaintext=@col[:body].dup
               plaintext=strip_markup(plaintext)
-              @col[:plaintext]=clean_searchable_text(plaintext)
+              @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)
               book_idx=book_idx_hash_to_str(data.idx)
-              @col[:book_idx]=clean_searchable_text(book_idx)
+              @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx)
               t=SiSU_DbTuple::LoadDocuments.new(@conn,@col,@opt,@file_maint)
               @tuple_array << t.tuple
               @en,@en_ast,@en_pls=[],[],[]
index ac96df383ccd82934f25c080ba4d30c4cf5f1b8d..6d2cbb0a8b14cb13981fca51e4a8fec53a2fbd5a 100644 (file)
@@ -71,7 +71,27 @@ module SiSU_DbText
         gsub(/#{Mx[:lnk_o]}\s*(.+?)\s*#{Mx[:lnk_c]}(?:file|ftp):\/\/\S+?([.,!?]?(?:\s|$))/,'\1\2').
         gsub(/#{Mx[:lnk_o]}\s*(.+?)\s*#{Mx[:lnk_c]}#{Mx[:url_o]}\S+?#{Mx[:url_c]}/,'\1')
     end
-    def clean_searchable_text(arr) #produce clean, searchable, plaintext from document source
+    def clean_searchable_text_from_document_objects(arr)
+      txt_arr,en=[],[]
+      arr=(arr.is_a?(String)) ? [ arr ] : arr
+      arr.each do |s|
+        s=s.gsub(/#{Mx[:fa_o]}[a-z]{1,4}#{Mx[:fa_o_c]}/m,'').
+            gsub(/#{Mx[:fa_c_o]}[a-z]{1,4}#{Mx[:fa_c]}/m,'').
+            gsub(/<br>/m,' ')
+        en << s.scan(/#{Mx[:en_a_o]}\s*(.+?)\s*#{Mx[:en_a_c]}/m)
+        s=s.gsub(/#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}/m,'').
+          gsub(/#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]}/m,'').
+          gsub(/ \s+/m,' ')
+        #p s if s =~/[^ \nA-Za-z0-9'"`?!#@$%^&*=+,.;:\[\]()<>{}‹›|\\\/~_-]/
+        s
+      end
+      txt_arr << arr << en
+      #txt_arr=txt_arr.flatten
+      txt=txt_arr.flatten.join("\n")
+      txt=special_character_escape(txt)
+      txt
+    end
+    def clean_searchable_text_from_document_source(arr)
       txt_arr,en=[],[]
       arr=(arr.is_a?(String)) ? arr.split(/\n+/m) : arr
       arr.each do |s|
@@ -107,14 +127,14 @@ module SiSU_DbText
           gsub(/^(?::?[A-C]\~(\S+)?)\s+/m,'').
           gsub(/^%{1,3} .+/m,''). #removed even if contained in code block
           gsub(/<br>/m,' ')
-        en << s.scan(/~\{\s*(.+?)\s*\}~/m)
+        #en << s.scan(/~\{\s*(.+?)\s*\}~/m)
         s=s.gsub(/~\{.+?\}~/m,'').
           gsub(/ \s+/m,' ')
-        #special_character_escape(s)
+        ##special_character_escape(s)
+        #p s if s =~/[^ \nA-Za-z0-9'"`?!#@$%^&*=+,.;:\[\]()<>{}‹›|\\\/~_-]/
         s
       end
       txt_arr << arr << en
-      #txt_arr=txt_arr.flatten
       txt=txt_arr.flatten.join("\n")
       txt=special_character_escape(txt)
       txt