From 33b94cc15e25dedcc6fb93d00942b97823090a4b Mon Sep 17 00:00:00 2001 From: Ralph Amissah Date: Sun, 28 Sep 2014 21:46:45 -0400 Subject: v5 v6: sql, clean searchable text * update for (ao/dal) text representation, fix legacy action --- lib/sisu/v5/db_sqltxt.rb | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) (limited to 'lib/sisu/v5/db_sqltxt.rb') diff --git a/lib/sisu/v5/db_sqltxt.rb b/lib/sisu/v5/db_sqltxt.rb index 99d417e1..adb2b0f0 100644 --- a/lib/sisu/v5/db_sqltxt.rb +++ b/lib/sisu/v5/db_sqltxt.rb @@ -71,7 +71,27 @@ module SiSU_DbText gsub(/#{Mx[:lnk_o]}\s*(.+?)\s*#{Mx[:lnk_c]}(?:file|ftp):\/\/\S+?([.,!?]?(?:\s|$))/,'\1\2'). gsub(/#{Mx[:lnk_o]}\s*(.+?)\s*#{Mx[:lnk_c]}#{Mx[:url_o]}\S+?#{Mx[:url_c]}/,'\1') end - def clean_searchable_text(arr) #produce clean, searchable, plaintext from document source + def clean_searchable_text_from_document_objects(arr) + txt_arr,en=[],[] + arr=(arr.is_a?(String)) ? [ arr ] : arr + arr.each do |s| + s=s.gsub(/#{Mx[:fa_o]}[a-z]{1,4}#{Mx[:fa_o_c]}/m,''). + gsub(/#{Mx[:fa_c_o]}[a-z]{1,4}#{Mx[:fa_c]}/m,''). + gsub(/<br>/m,' ') + en << s.scan(/#{Mx[:en_a_o]}\s*(.+?)\s*#{Mx[:en_a_c]}/m) + s=s.gsub(/#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}/m,''). + gsub(/#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]}/m,''). + gsub(/ \s+/m,' ') + #p s if s =~/[^ \nA-Za-z0-9'"`?!#@$%^&*=+,.;:\[\]()<>{}‹›|\\\/~_-]/ + s + end + txt_arr << arr << en + #txt_arr=txt_arr.flatten + txt=txt_arr.flatten.join("\n") + txt=special_character_escape(txt) + txt + end + def clean_searchable_text_from_document_source(arr) txt_arr,en=[],[] arr=(arr.is_a?(String)) ? arr.split(/\n+/m) : arr arr.each do |s| @@ -107,14 +127,14 @@ module SiSU_DbText gsub(/^(?::?[A-C]\~(\S+)?)\s+/m,''). gsub(/^%{1,3} .+/m,''). #removed even if contained in code block gsub(/<br>/m,' ') - en << s.scan(/~\{\s*(.+?)\s*\}~/m) + #en << s.scan(/~\{\s*(.+?)\s*\}~/m) s=s.gsub(/~\{.+?\}~/m,''). gsub(/ \s+/m,' ') - #special_character_escape(s) + ##special_character_escape(s) + #p s if s =~/[^ \nA-Za-z0-9'"`?!#@$%^&*=+,.;:\[\]()<>{}‹›|\\\/~_-]/ s end txt_arr << arr << en - #txt_arr=txt_arr.flatten txt=txt_arr.flatten.join("\n") txt=special_character_escape(txt) txt -- cgit v1.2.3 From ca70dde521c90150f8c36ecf83f05c2d952d1712 Mon Sep 17 00:00:00 2001 From: Ralph Amissah Date: Sun, 28 Sep 2014 22:07:48 -0400 Subject: v5 v6: header comment reduction, minor --- lib/sisu/v5/db_sqltxt.rb | 3 --- 1 file changed, 3 deletions(-) (limited to 'lib/sisu/v5/db_sqltxt.rb') diff --git a/lib/sisu/v5/db_sqltxt.rb b/lib/sisu/v5/db_sqltxt.rb index adb2b0f0..ffb5966b 100644 --- a/lib/sisu/v5/db_sqltxt.rb +++ b/lib/sisu/v5/db_sqltxt.rb @@ -46,9 +46,6 @@ - * Download: - - * Git -- cgit v1.2.3