From 33b94cc15e25dedcc6fb93d00942b97823090a4b Mon Sep 17 00:00:00 2001 From: Ralph Amissah Date: Sun, 28 Sep 2014 21:46:45 -0400 Subject: v5 v6: sql, clean searchable text * update for (ao/dal) text representation, fix legacy action --- data/doc/sisu/CHANGELOG_v5 | 3 +++ data/doc/sisu/CHANGELOG_v6 | 3 +++ lib/sisu/v5/db_columns.rb | 2 +- lib/sisu/v5/db_import.rb | 26 +++++++++++++------------- lib/sisu/v5/db_sqltxt.rb | 28 ++++++++++++++++++++++++---- lib/sisu/v6/db_columns.rb | 2 +- lib/sisu/v6/db_import.rb | 26 +++++++++++++------------- lib/sisu/v6/db_sqltxt.rb | 28 ++++++++++++++++++++++++---- 8 files changed, 82 insertions(+), 36 deletions(-) diff --git a/data/doc/sisu/CHANGELOG_v5 b/data/doc/sisu/CHANGELOG_v5 index 2b2a7b89..eed72ecb 100644 --- a/data/doc/sisu/CHANGELOG_v5 +++ b/data/doc/sisu/CHANGELOG_v5 @@ -38,6 +38,9 @@ http://www.jus.uio.no/sisu/pkg/src/sisu_5.6.8.orig.tar.xz sisu_5.6.8.orig.tar.xz sisu_5.6.8-1.dsc +* sql, clean searchable text, update for (ao/dal) text representation, + fix legacy action + %% 5.6.7.orig.tar.xz (2014-09-19:37/5) http://git.sisudoc.org/gitweb/?p=code/sisu.git;a=log;h=refs/tags/sisu_5.6.7 http://git.sisudoc.org/gitweb/?p=code/sisu.git;a=log;h=refs/tags/debian/sisu_5.6.7-1 diff --git a/data/doc/sisu/CHANGELOG_v6 b/data/doc/sisu/CHANGELOG_v6 index 9771fccf..7350e085 100644 --- a/data/doc/sisu/CHANGELOG_v6 +++ b/data/doc/sisu/CHANGELOG_v6 @@ -28,6 +28,9 @@ http://www.jus.uio.no/sisu/pkg/src/sisu_6.2.9.orig.tar.xz sisu_6.2.9.orig.tar.xz sisu_6.2.9-1.dsc +* sql, clean searchable text, update for (ao/dal) text representation, + fix legacy action + %% 6.2.8.orig.tar.xz (2014-09-19:37/5) http://git.sisudoc.org/gitweb/?p=code/sisu.git;a=log;h=refs/tags/sisu_6.2.8 http://git.sisudoc.org/gitweb/?p=code/sisu.git;a=log;h=refs/tags/debian/sisu_6.2.8-1 diff --git a/lib/sisu/v5/db_columns.rb b/lib/sisu/v5/db_columns.rb index 44d45e95..15341042 100644 --- a/lib/sisu/v5/db_columns.rb +++ b/lib/sisu/v5/db_columns.rb @@ -81,7 +81,7 @@ module 
SiSU_DbColumns @sisutxt=special_character_escape(src) else @sisutxt='' end - @fulltext=clean_searchable_text(txt_arr) + @fulltext=clean_searchable_text_from_document_objects(txt_arr) else @sisutxt,@fulltext='','' end end diff --git a/lib/sisu/v5/db_import.rb b/lib/sisu/v5/db_import.rb index a7f33939..8a500f8a 100644 --- a/lib/sisu/v5/db_import.rb +++ b/lib/sisu/v5/db_import.rb @@ -292,7 +292,7 @@ module SiSU_DbImport src=txt_arr.join("\n") src=special_character_escape(src) @tp[:sisutxt_f],@tp[:sisutxt_i]='sisutxt, ',"'#{src}', " - txt=clean_searchable_text(txt_arr) + txt=clean_searchable_text_from_document_source(txt_arr) #txt=special_character_escape(txt) @tp[:fulltxt_f],@tp[:fulltxt_i]='fulltxt, ',"'#{txt}', " end @@ -374,9 +374,9 @@ module SiSU_DbImport @col[:body]=special_character_escape(body) plaintext=@col[:body].dup plaintext=strip_markup(plaintext) - @col[:plaintext]=clean_searchable_text(plaintext) + @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) book_idx=book_idx_hash_to_str(data.idx) - @col[:book_idx]=clean_searchable_text(book_idx) + @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx) if @en[0] then @en_a,@en_z=@en[0].first,@en[0].last end if @en_ast[0] then @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last @@ -425,9 +425,9 @@ module SiSU_DbImport @col[:body]=special_character_escape(body) plaintext=@col[:body].dup plaintext=strip_markup(plaintext) - @col[:plaintext]=clean_searchable_text(plaintext) + @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) book_idx=book_idx_hash_to_str(data.idx) - @col[:book_idx]=clean_searchable_text(book_idx) + @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx) @en_a,@en_z=@en[0].first,@en[0].last if @en[0] @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0] @en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0] @@ -470,9 +470,9 @@ module SiSU_DbImport 
@col[:body]=special_character_escape(body) plaintext=@col[:body].dup plaintext=strip_markup(plaintext) - @col[:plaintext]=clean_searchable_text(plaintext) + @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) book_idx=book_idx_hash_to_str(data.idx) - @col[:book_idx]=clean_searchable_text(book_idx) + @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx) @en_a,@en_z=@en[0].first,@en[0].last if @en[0] @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0] @en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0] @@ -501,9 +501,9 @@ module SiSU_DbImport @col[:body]=special_character_escape(body) plaintext=@col[:body].dup plaintext=strip_markup(plaintext) - @col[:plaintext]=clean_searchable_text(plaintext) + @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) book_idx=book_idx_hash_to_str(data.idx) - @col[:book_idx]=clean_searchable_text(book_idx) + @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx) @en_a,@en_z=@en[0].first,@en[0].last if @en[0] @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0] @en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0] @@ -532,9 +532,9 @@ module SiSU_DbImport @col[:body]=special_character_escape(body) plaintext=@col[:body].dup plaintext=strip_markup(plaintext) - @col[:plaintext]=clean_searchable_text(plaintext) + @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) book_idx=book_idx_hash_to_str(data.idx) - @col[:book_idx]=clean_searchable_text(book_idx) + @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx) @en_a,@en_z=@en[0].first,@en[0].last if @en[0] @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0] @en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0] @@ -603,9 +603,9 @@ module SiSU_DbImport @col[:body]=special_character_escape(body) plaintext=@col[:body].dup plaintext=strip_markup(plaintext) - 
@col[:plaintext]=clean_searchable_text(plaintext) + @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) book_idx=book_idx_hash_to_str(data.idx) - @col[:book_idx]=clean_searchable_text(book_idx) + @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx) t=SiSU_DbTuple::LoadDocuments.new(@conn,@col,@opt,@file_maint) @tuple_array << t.tuple @en,@en_ast,@en_pls=[],[],[] diff --git a/lib/sisu/v5/db_sqltxt.rb b/lib/sisu/v5/db_sqltxt.rb index 99d417e1..adb2b0f0 100644 --- a/lib/sisu/v5/db_sqltxt.rb +++ b/lib/sisu/v5/db_sqltxt.rb @@ -71,7 +71,27 @@ module SiSU_DbText gsub(/#{Mx[:lnk_o]}\s*(.+?)\s*#{Mx[:lnk_c]}(?:file|ftp):\/\/\S+?([.,!?]?(?:\s|$))/,'\1\2'). gsub(/#{Mx[:lnk_o]}\s*(.+?)\s*#{Mx[:lnk_c]}#{Mx[:url_o]}\S+?#{Mx[:url_c]}/,'\1') end - def clean_searchable_text(arr) #produce clean, searchable, plaintext from document source + def clean_searchable_text_from_document_objects(arr) + txt_arr,en=[],[] + arr=(arr.is_a?(String)) ? [ arr ] : arr + arr.each do |s| + s=s.gsub(/#{Mx[:fa_o]}[a-z]{1,4}#{Mx[:fa_o_c]}/m,''). + gsub(/#{Mx[:fa_c_o]}[a-z]{1,4}#{Mx[:fa_c]}/m,'').
+ gsub(/<br>/m,' ') + en << s.scan(/#{Mx[:en_a_o]}\s*(.+?)\s*#{Mx[:en_a_c]}/m) + s=s.gsub(/#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}/m,''). + gsub(/#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]}/m,''). + gsub(/ \s+/m,' ') + #p s if s =~/[^ \nA-Za-z0-9'"`?!#@$%^&*=+,.;:\[\]()<>{}‹›|\\\/~_-]/ + s + end + txt_arr << arr << en + #txt_arr=txt_arr.flatten + txt=txt_arr.flatten.join("\n") + txt=special_character_escape(txt) + txt + end + def clean_searchable_text_from_document_source(arr) txt_arr,en=[],[] arr=(arr.is_a?(String)) ? arr.split(/\n+/m) : arr arr.each do |s| @@ -107,14 +127,14 @@ module SiSU_DbText gsub(/^(?::?[A-C]\~(\S+)?)\s+/m,''). gsub(/^%{1,3} .+/m,''). #removed even if contained in code block
gsub(/<br>/m,' ') - en << s.scan(/~\{\s*(.+?)\s*\}~/m) + #en << s.scan(/~\{\s*(.+?)\s*\}~/m) s=s.gsub(/~\{.+?\}~/m,''). gsub(/ \s+/m,' ') - #special_character_escape(s) + ##special_character_escape(s) + #p s if s =~/[^ \nA-Za-z0-9'"`?!#@$%^&*=+,.;:\[\]()<>{}‹›|\\\/~_-]/ s end txt_arr << arr << en - #txt_arr=txt_arr.flatten txt=txt_arr.flatten.join("\n") txt=special_character_escape(txt) txt diff --git a/lib/sisu/v6/db_columns.rb b/lib/sisu/v6/db_columns.rb index 343f66e6..005c45b5 100644 --- a/lib/sisu/v6/db_columns.rb +++ b/lib/sisu/v6/db_columns.rb @@ -81,7 +81,7 @@ module SiSU_DbColumns @sisutxt=special_character_escape(src) else @sisutxt='' end - @fulltext=clean_searchable_text(txt_arr) + @fulltext=clean_searchable_text_from_document_objects(txt_arr) else @sisutxt,@fulltext='','' end end diff --git a/lib/sisu/v6/db_import.rb b/lib/sisu/v6/db_import.rb index 3e2e7271..ccffb904 100644 --- a/lib/sisu/v6/db_import.rb +++ b/lib/sisu/v6/db_import.rb @@ -292,7 +292,7 @@ module SiSU_DbImport src=txt_arr.join("\n") src=special_character_escape(src) @tp[:sisutxt_f],@tp[:sisutxt_i]='sisutxt, ',"'#{src}', " - txt=clean_searchable_text(txt_arr) + txt=clean_searchable_text_from_document_source(txt_arr) #txt=special_character_escape(txt) @tp[:fulltxt_f],@tp[:fulltxt_i]='fulltxt, ',"'#{txt}', " end @@ -374,9 +374,9 @@ module SiSU_DbImport @col[:body]=special_character_escape(body) plaintext=@col[:body].dup plaintext=strip_markup(plaintext) - @col[:plaintext]=clean_searchable_text(plaintext) + @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) book_idx=book_idx_hash_to_str(data.idx) - @col[:book_idx]=clean_searchable_text(book_idx) + @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx) if @en[0] then @en_a,@en_z=@en[0].first,@en[0].last end if @en_ast[0] then @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last @@ -425,9 +425,9 @@ module SiSU_DbImport @col[:body]=special_character_escape(body) plaintext=@col[:body].dup
plaintext=strip_markup(plaintext) - @col[:plaintext]=clean_searchable_text(plaintext) + @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) book_idx=book_idx_hash_to_str(data.idx) - @col[:book_idx]=clean_searchable_text(book_idx) + @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx) @en_a,@en_z=@en[0].first,@en[0].last if @en[0] @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0] @en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0] @@ -470,9 +470,9 @@ module SiSU_DbImport @col[:body]=special_character_escape(body) plaintext=@col[:body].dup plaintext=strip_markup(plaintext) - @col[:plaintext]=clean_searchable_text(plaintext) + @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) book_idx=book_idx_hash_to_str(data.idx) - @col[:book_idx]=clean_searchable_text(book_idx) + @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx) @en_a,@en_z=@en[0].first,@en[0].last if @en[0] @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0] @en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0] @@ -501,9 +501,9 @@ module SiSU_DbImport @col[:body]=special_character_escape(body) plaintext=@col[:body].dup plaintext=strip_markup(plaintext) - @col[:plaintext]=clean_searchable_text(plaintext) + @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) book_idx=book_idx_hash_to_str(data.idx) - @col[:book_idx]=clean_searchable_text(book_idx) + @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx) @en_a,@en_z=@en[0].first,@en[0].last if @en[0] @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0] @en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0] @@ -532,9 +532,9 @@ module SiSU_DbImport @col[:body]=special_character_escape(body) plaintext=@col[:body].dup plaintext=strip_markup(plaintext) - @col[:plaintext]=clean_searchable_text(plaintext) + 
@col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) book_idx=book_idx_hash_to_str(data.idx) - @col[:book_idx]=clean_searchable_text(book_idx) + @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx) @en_a,@en_z=@en[0].first,@en[0].last if @en[0] @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0] @en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0] @@ -603,9 +603,9 @@ module SiSU_DbImport @col[:body]=special_character_escape(body) plaintext=@col[:body].dup plaintext=strip_markup(plaintext) - @col[:plaintext]=clean_searchable_text(plaintext) + @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) book_idx=book_idx_hash_to_str(data.idx) - @col[:book_idx]=clean_searchable_text(book_idx) + @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx) t=SiSU_DbTuple::LoadDocuments.new(@conn,@col,@opt,@file_maint) @tuple_array << t.tuple @en,@en_ast,@en_pls=[],[],[] diff --git a/lib/sisu/v6/db_sqltxt.rb b/lib/sisu/v6/db_sqltxt.rb index ac96df38..6d2cbb0a 100644 --- a/lib/sisu/v6/db_sqltxt.rb +++ b/lib/sisu/v6/db_sqltxt.rb @@ -71,7 +71,27 @@ module SiSU_DbText gsub(/#{Mx[:lnk_o]}\s*(.+?)\s*#{Mx[:lnk_c]}(?:file|ftp):\/\/\S+?([.,!?]?(?:\s|$))/,'\1\2'). gsub(/#{Mx[:lnk_o]}\s*(.+?)\s*#{Mx[:lnk_c]}#{Mx[:url_o]}\S+?#{Mx[:url_c]}/,'\1') end - def clean_searchable_text(arr) #produce clean, searchable, plaintext from document source + def clean_searchable_text_from_document_objects(arr) + txt_arr,en=[],[] + arr=(arr.is_a?(String)) ? [ arr ] : arr + arr.each do |s| + s=s.gsub(/#{Mx[:fa_o]}[a-z]{1,4}#{Mx[:fa_o_c]}/m,''). + gsub(/#{Mx[:fa_c_o]}[a-z]{1,4}#{Mx[:fa_c]}/m,'').
+ gsub(/<br>/m,' ') + en << s.scan(/#{Mx[:en_a_o]}\s*(.+?)\s*#{Mx[:en_a_c]}/m) + s=s.gsub(/#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}/m,''). + gsub(/#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]}/m,''). + gsub(/ \s+/m,' ') + #p s if s =~/[^ \nA-Za-z0-9'"`?!#@$%^&*=+,.;:\[\]()<>{}‹›|\\\/~_-]/ + s + end + txt_arr << arr << en + #txt_arr=txt_arr.flatten + txt=txt_arr.flatten.join("\n") + txt=special_character_escape(txt) + txt + end + def clean_searchable_text_from_document_source(arr) txt_arr,en=[],[] arr=(arr.is_a?(String)) ? arr.split(/\n+/m) : arr arr.each do |s| @@ -107,14 +127,14 @@ module SiSU_DbText gsub(/^(?::?[A-C]\~(\S+)?)\s+/m,''). gsub(/^%{1,3} .+/m,''). #removed even if contained in code block
gsub(/<br>/m,' ') - en << s.scan(/~\{\s*(.+?)\s*\}~/m) + #en << s.scan(/~\{\s*(.+?)\s*\}~/m) s=s.gsub(/~\{.+?\}~/m,''). gsub(/ \s+/m,' ') - #special_character_escape(s) + ##special_character_escape(s) + #p s if s =~/[^ \nA-Za-z0-9'"`?!#@$%^&*=+,.;:\[\]()<>{}‹›|\\\/~_-]/ s end txt_arr << arr << en - #txt_arr=txt_arr.flatten txt=txt_arr.flatten.join("\n") txt=special_character_escape(txt) txt -- cgit v1.2.3