diff options
author | Ralph Amissah <ralph@amissah.com> | 2014-09-28 22:20:28 -0400 |
---|---|---|
committer | Ralph Amissah <ralph@amissah.com> | 2014-09-28 22:20:28 -0400 |
commit | 5508dd7359fb25405d820ba6def305a9a3df65b9 (patch) | |
tree | 6edec93bf8770120eeba5a585a740430eb8685d1 /lib/sisu/v5/db_sqltxt.rb | |
parent | debian/changelog (5.6.7-1) (diff) | |
parent | v5 v6: header comment reduction, minor (diff) |
Merge tag 'sisu_5.6.8' into debian/sid
SiSU 5.6.8
Diffstat (limited to 'lib/sisu/v5/db_sqltxt.rb')
-rw-r--r-- | lib/sisu/v5/db_sqltxt.rb | 31 |
1 files changed, 24 insertions, 7 deletions
diff --git a/lib/sisu/v5/db_sqltxt.rb b/lib/sisu/v5/db_sqltxt.rb index 99d417e1..ffb5966b 100644 --- a/lib/sisu/v5/db_sqltxt.rb +++ b/lib/sisu/v5/db_sqltxt.rb @@ -46,9 +46,6 @@ <http://www.jus.uio.no/sisu> <http://www.sisudoc.org> - * Download: - <http://www.sisudoc.org/sisu/en/SiSU/download.html> - * Git <http://git.sisudoc.org/gitweb/?p=code/sisu.git;a=summary> <http://git.sisudoc.org/gitweb/?p=code/sisu.git;a=blob;f=lib/sisu/v5/db_sqltxt.rb;hb=HEAD> @@ -71,7 +68,27 @@ module SiSU_DbText gsub(/#{Mx[:lnk_o]}\s*(.+?)\s*#{Mx[:lnk_c]}(?:file|ftp):\/\/\S+?([.,!?]?(?:\s|$))/,'\1\2'). gsub(/#{Mx[:lnk_o]}\s*(.+?)\s*#{Mx[:lnk_c]}#{Mx[:url_o]}\S+?#{Mx[:url_c]}/,'\1') end - def clean_searchable_text(arr) #produce clean, searchable, plaintext from document source + def clean_searchable_text_from_document_objects(arr) + txt_arr,en=[],[] + arr=(arr.is_a?(String)) ? [ arr ] : arr + arr.each do |s| + s=s.gsub(/#{Mx[:fa_o]}[a-z]{1,4}#{Mx[:fa_o_c]}/m,''). + gsub(/#{Mx[:fa_c_o]}[a-z]{1,4}#{Mx[:fa_c]}/m,''). + gsub(/<br>/m,' ') + en << s.scan(/#{Mx[:en_a_o]}\s*(.+?)\s*#{Mx[:en_a_c]}/m) + s=s.gsub(/#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}/m,''). + gsub(/#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]}/m,''). + gsub(/ \s+/m,' ') + #p s if s =~/[^ \nA-Za-z0-9'"`?!#@$%^&*=+,.;:\[\]()<>{}‹›|\\\/~_-]/ + s + end + txt_arr << arr << en + #txt_arr=txt_arr.flatten + txt=txt_arr.flatten.join("\n") + txt=special_character_escape(txt) + txt + end + def clean_searchable_text_from_document_source(arr) txt_arr,en=[],[] arr=(arr.is_a?(String)) ? arr.split(/\n+/m) : arr arr.each do |s| @@ -107,14 +124,14 @@ module SiSU_DbText gsub(/^(?::?[A-C]\~(\S+)?)\s+/m,''). gsub(/^%{1,3} .+/m,''). #removed even if contained in code block gsub(/<br>/m,' ') - en << s.scan(/~\{\s*(.+?)\s*\}~/m) + #en << s.scan(/~\{\s*(.+?)\s*\}~/m) s=s.gsub(/~\{.+?\}~/m,''). gsub(/ \s+/m,' ') - #special_character_escape(s) + ##special_character_escape(s) + #p s if s =~/[^ \nA-Za-z0-9'"`?!#@$%^&*=+,.;:\[\]()<>{}‹›|\\\/~_-]/ s end txt_arr << arr << en - #txt_arr=txt_arr.flatten txt=txt_arr.flatten.join("\n") txt=special_character_escape(txt) txt |