about | summary | refs | log | tree | commit | diff | homepage
path: root/lib/sisu/v5/db_sqltxt.rb
diff options
context:
space:
mode:
Diffstat (limited to 'lib/sisu/v5/db_sqltxt.rb')
-rw-r--r-- lib/sisu/v5/db_sqltxt.rb 31
1 files changed, 24 insertions, 7 deletions
diff --git a/lib/sisu/v5/db_sqltxt.rb b/lib/sisu/v5/db_sqltxt.rb
index 99d417e1..ffb5966b 100644
--- a/lib/sisu/v5/db_sqltxt.rb
+++ b/lib/sisu/v5/db_sqltxt.rb
@@ -46,9 +46,6 @@
<http://www.jus.uio.no/sisu>
<http://www.sisudoc.org>
- * Download:
- <http://www.sisudoc.org/sisu/en/SiSU/download.html>
-
* Git
<http://git.sisudoc.org/gitweb/?p=code/sisu.git;a=summary>
<http://git.sisudoc.org/gitweb/?p=code/sisu.git;a=blob;f=lib/sisu/v5/db_sqltxt.rb;hb=HEAD>
@@ -71,7 +68,27 @@ module SiSU_DbText
gsub(/#{Mx[:lnk_o]}\s*(.+?)\s*#{Mx[:lnk_c]}(?:file|ftp):\/\/\S+?([.,!?]?(?:\s|$))/,'\1\2').
gsub(/#{Mx[:lnk_o]}\s*(.+?)\s*#{Mx[:lnk_c]}#{Mx[:url_o]}\S+?#{Mx[:url_c]}/,'\1')
end
- def clean_searchable_text(arr) #produce clean, searchable, plaintext from document source
+ # Produce clean, searchable plaintext from document objects.
+ # arr: a String (treated as a single object) or an Array of Strings.
+ # Returns one String: cleaned texts, then the captured Mx[:en_a_*] spans (presumably endnotes), newline-joined and escaped.
+ def clean_searchable_text_from_document_objects(arr)
+   en=[]
+   arr=(arr.is_a?(String)) ? [ arr ] : arr
+   # map, not each: gsub returns new strings and each returns its receiver, so each discarded every substitution
+   txt_arr=arr.map do |s|
+     s=s.gsub(/#{Mx[:fa_o]}[a-z]{1,4}#{Mx[:fa_o_c]}/m,'').
+       gsub(/#{Mx[:fa_c_o]}[a-z]{1,4}#{Mx[:fa_c]}/m,'').
+       gsub(/<br>/m,' ')
+     en << s.scan(/#{Mx[:en_a_o]}\s*(.+?)\s*#{Mx[:en_a_c]}/m) # capture the span text before it is cut from the body
+     s.gsub(/#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}/m,'').
+       gsub(/#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]}/m,'').
+       gsub(/ \s+/m,' ')
+   end
+   txt_arr << en # captured spans follow all cleaned text; flatten unwraps the nested scan results
+   txt=txt_arr.flatten.join("\n")
+   special_character_escape(txt)
+ end
+ def clean_searchable_text_from_document_source(arr)
txt_arr,en=[],[]
arr=(arr.is_a?(String)) ? arr.split(/\n+/m) : arr
arr.each do |s|
@@ -107,14 +124,14 @@ module SiSU_DbText
gsub(/^(?::?[A-C]\~(\S+)?)\s+/m,'').
gsub(/^%{1,3} .+/m,''). #removed even if contained in code block
gsub(/<br>/m,' ')
- en << s.scan(/~\{\s*(.+?)\s*\}~/m)
+ #en << s.scan(/~\{\s*(.+?)\s*\}~/m)
s=s.gsub(/~\{.+?\}~/m,'').
gsub(/ \s+/m,' ')
- #special_character_escape(s)
+ ##special_character_escape(s)
+ #p s if s =~/[^ \nA-Za-z0-9'"`?!#@$%^&*=+,.;:\[\]()<>{}‹›|\\\/~_-]/
s
end
txt_arr << arr << en
- #txt_arr=txt_arr.flatten
txt=txt_arr.flatten.join("\n")
txt=special_character_escape(txt)
txt