about summary refs log tree commit diff homepage
path: root/lib/sisu/v5/db_sqltxt.rb
diff options
context:
space:
mode:
author Ralph Amissah <ralph@amissah.com> 2014-09-28 21:46:45 -0400
committer Ralph Amissah <ralph@amissah.com> 2014-09-28 22:09:38 -0400
commit 33b94cc15e25dedcc6fb93d00942b97823090a4b (patch)
tree 5b801f54842151af111e1cffc6a57e1f04ef84c4 /lib/sisu/v5/db_sqltxt.rb
parent v5 v6: manifest, renamed html_manifest (diff)
v5 v6: sql, clean searchable text
* update for (ao/dal) text representation, fix legacy action
Diffstat (limited to 'lib/sisu/v5/db_sqltxt.rb')
-rw-r--r-- lib/sisu/v5/db_sqltxt.rb 28
1 files changed, 24 insertions, 4 deletions
diff --git a/lib/sisu/v5/db_sqltxt.rb b/lib/sisu/v5/db_sqltxt.rb
index 99d417e1..adb2b0f0 100644
--- a/lib/sisu/v5/db_sqltxt.rb
+++ b/lib/sisu/v5/db_sqltxt.rb
@@ -71,7 +71,27 @@ module SiSU_DbText
gsub(/#{Mx[:lnk_o]}\s*(.+?)\s*#{Mx[:lnk_c]}(?:file|ftp):\/\/\S+?([.,!?]?(?:\s|$))/,'\1\2').
gsub(/#{Mx[:lnk_o]}\s*(.+?)\s*#{Mx[:lnk_c]}#{Mx[:url_o]}\S+?#{Mx[:url_c]}/,'\1')
end
- def clean_searchable_text(arr) #produce clean, searchable, plaintext from document source
+ def clean_searchable_text_from_document_objects(arr)
+ txt_arr,en=[],[]
+ arr=(arr.is_a?(String)) ? [ arr ] : arr
+ arr.each do |s|
+ s=s.gsub(/#{Mx[:fa_o]}[a-z]{1,4}#{Mx[:fa_o_c]}/m,'').
+ gsub(/#{Mx[:fa_c_o]}[a-z]{1,4}#{Mx[:fa_c]}/m,'').
+ gsub(/<br>/m,' ')
+ en << s.scan(/#{Mx[:en_a_o]}\s*(.+?)\s*#{Mx[:en_a_c]}/m)
+ s=s.gsub(/#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}/m,'').
+ gsub(/#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]}/m,'').
+ gsub(/ \s+/m,' ')
+ #p s if s =~/[^ \nA-Za-z0-9'"`?!#@$%^&*=+,.;:\[\]()<>{}‹›|\\\/~_-]/
+ s
+ end
+ txt_arr << arr << en
+ #txt_arr=txt_arr.flatten
+ txt=txt_arr.flatten.join("\n")
+ txt=special_character_escape(txt)
+ txt
+ end
+ def clean_searchable_text_from_document_source(arr)
txt_arr,en=[],[]
arr=(arr.is_a?(String)) ? arr.split(/\n+/m) : arr
arr.each do |s|
@@ -107,14 +127,14 @@ module SiSU_DbText
gsub(/^(?::?[A-C]\~(\S+)?)\s+/m,'').
gsub(/^%{1,3} .+/m,''). #removed even if contained in code block
gsub(/<br>/m,' ')
- en << s.scan(/~\{\s*(.+?)\s*\}~/m)
+ #en << s.scan(/~\{\s*(.+?)\s*\}~/m)
s=s.gsub(/~\{.+?\}~/m,'').
gsub(/ \s+/m,' ')
- #special_character_escape(s)
+ ##special_character_escape(s)
+ #p s if s =~/[^ \nA-Za-z0-9'"`?!#@$%^&*=+,.;:\[\]()<>{}‹›|\\\/~_-]/
s
end
txt_arr << arr << en
- #txt_arr=txt_arr.flatten
txt=txt_arr.flatten.join("\n")
txt=special_character_escape(txt)
txt