diff options
author | Ralph Amissah <ralph@amissah.com> | 2014-09-28 21:46:45 -0400 |
---|---|---|
committer | Ralph Amissah <ralph@amissah.com> | 2014-09-28 22:09:38 -0400 |
commit | 33b94cc15e25dedcc6fb93d00942b97823090a4b (patch) | |
tree | 5b801f54842151af111e1cffc6a57e1f04ef84c4 /lib/sisu/v6/db_sqltxt.rb | |
parent | v5 v6: manifest, renamed html_manifest (diff) |
v5 v6: sql, clean searchable text
* update for (ao/dal) text representation, fix legacy action
Diffstat (limited to 'lib/sisu/v6/db_sqltxt.rb')
-rw-r--r-- | lib/sisu/v6/db_sqltxt.rb | 28 |
1 files changed, 24 insertions, 4 deletions
diff --git a/lib/sisu/v6/db_sqltxt.rb b/lib/sisu/v6/db_sqltxt.rb index ac96df38..6d2cbb0a 100644 --- a/lib/sisu/v6/db_sqltxt.rb +++ b/lib/sisu/v6/db_sqltxt.rb @@ -71,7 +71,27 @@ module SiSU_DbText gsub(/#{Mx[:lnk_o]}\s*(.+?)\s*#{Mx[:lnk_c]}(?:file|ftp):\/\/\S+?([.,!?]?(?:\s|$))/,'\1\2'). gsub(/#{Mx[:lnk_o]}\s*(.+?)\s*#{Mx[:lnk_c]}#{Mx[:url_o]}\S+?#{Mx[:url_c]}/,'\1') end - def clean_searchable_text(arr) #produce clean, searchable, plaintext from document source + def clean_searchable_text_from_document_objects(arr) + txt_arr,en=[],[] + arr=(arr.is_a?(String)) ? [ arr ] : arr + arr.each do |s| + s=s.gsub(/#{Mx[:fa_o]}[a-z]{1,4}#{Mx[:fa_o_c]}/m,''). + gsub(/#{Mx[:fa_c_o]}[a-z]{1,4}#{Mx[:fa_c]}/m,''). + gsub(/<br>/m,' ') + en << s.scan(/#{Mx[:en_a_o]}\s*(.+?)\s*#{Mx[:en_a_c]}/m) + s=s.gsub(/#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}/m,''). + gsub(/#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]}/m,''). + gsub(/ \s+/m,' ') + #p s if s =~/[^ \nA-Za-z0-9'"`?!#@$%^&*=+,.;:\[\]()<>{}‹›|\\\/~_-]/ + s + end + txt_arr << arr << en + #txt_arr=txt_arr.flatten + txt=txt_arr.flatten.join("\n") + txt=special_character_escape(txt) + txt + end + def clean_searchable_text_from_document_source(arr) txt_arr,en=[],[] arr=(arr.is_a?(String)) ? arr.split(/\n+/m) : arr arr.each do |s| @@ -107,14 +127,14 @@ module SiSU_DbText gsub(/^(?::?[A-C]\~(\S+)?)\s+/m,''). gsub(/^%{1,3} .+/m,''). #removed even if contained in code block gsub(/<br>/m,' ') - en << s.scan(/~\{\s*(.+?)\s*\}~/m) + #en << s.scan(/~\{\s*(.+?)\s*\}~/m) s=s.gsub(/~\{.+?\}~/m,''). gsub(/ \s+/m,' ') - #special_character_escape(s) + ##special_character_escape(s) + #p s if s =~/[^ \nA-Za-z0-9'"`?!#@$%^&*=+,.;:\[\]()<>{}‹›|\\\/~_-]/ s end txt_arr << arr << en - #txt_arr=txt_arr.flatten txt=txt_arr.flatten.join("\n") txt=special_character_escape(txt) txt |