From 2c73f3060f9678f751c236fe17863d443f6a650f Mon Sep 17 00:00:00 2001 From: Ralph Amissah Date: Sun, 19 Oct 2014 21:13:52 -0400 Subject: v5 v6: db, text search & display field, footnotes moved to end of text object * cleaner, more useful search results * cleaner text search field * separate footnote fields redundant for search purposes --- lib/sisu/v5/db_import.rb | 34 +++++++++++++++++----------------- lib/sisu/v5/db_sqltxt.rb | 28 ++++++++++++++++++++++------ lib/sisu/v6/db_import.rb | 34 +++++++++++++++++----------------- lib/sisu/v6/db_sqltxt.rb | 28 ++++++++++++++++++++++------ 4 files changed, 78 insertions(+), 46 deletions(-) (limited to 'lib') diff --git a/lib/sisu/v5/db_import.rb b/lib/sisu/v5/db_import.rb index 59cff28a..72fb3753 100644 --- a/lib/sisu/v5/db_import.rb +++ b/lib/sisu/v5/db_import.rb @@ -334,17 +334,17 @@ module SiSU_DbImport @en,@en_ast,@en_pls,@tuple_array=[],[],[],[] @col[:en_a],@col[:en_z]=nil,nil ao_array.each do |data| - data.obj.gsub!(/#{Mx[:fa_bold_o]}(.+?)#{Mx[:fa_bold_c]}/,'\1') - data.obj.gsub!(/#{Mx[:fa_italics_o]}(.+?)#{Mx[:fa_italics_c]}/,'\1') - data.obj.gsub!(/#{Mx[:fa_underscore_o]}(.+?)#{Mx[:fa_underscore_c]}/,'\1') - data.obj.gsub!(/#{Mx[:fa_superscript_o]}(.+?)#{Mx[:fa_superscript_c]}/,'\1') - data.obj.gsub!(/#{Mx[:fa_subscript_o]}(.+?)#{Mx[:fa_subscript_c]}/,'\1') - data.obj.gsub!(/#{Mx[:fa_insert_o]}(.+?)#{Mx[:fa_insert_c]}/,'\1') - data.obj.gsub!(/#{Mx[:fa_cite_o]}(.+?)#{Mx[:fa_cite_c]}/,'\1') - data.obj.gsub!(/#{Mx[:fa_strike_o]}(.+?)#{Mx[:fa_strike_c]}/,'\1') - data.obj.gsub!(/#{Mx[:fa_monospace_o]}(.+?)#{Mx[:fa_monospace_c]}/,'\1') - data.obj.gsub!(/#{Mx[:gl_o]}(●)#{Mx[:gl_c]}\s*/,'\1 ') - data.obj.gsub!(/#{Mx[:tag_o]}\S+?#{Mx[:tag_c]}/,'') #check + data.obj=data.obj.gsub(/#{Mx[:fa_bold_o]}(.+?)#{Mx[:fa_bold_c]}/,'\1'). + gsub(/#{Mx[:fa_italics_o]}(.+?)#{Mx[:fa_italics_c]}/,'\1'). + gsub(/#{Mx[:fa_underscore_o]}(.+?)#{Mx[:fa_underscore_c]}/,'\1'). + gsub(/#{Mx[:fa_superscript_o]}(.+?)#{Mx[:fa_superscript_c]}/,'\1'). + gsub(/#{Mx[:fa_subscript_o]}(.+?)#{Mx[:fa_subscript_c]}/,'\1'). + gsub(/#{Mx[:fa_insert_o]}(.+?)#{Mx[:fa_insert_c]}/,'\1'). + gsub(/#{Mx[:fa_cite_o]}(.+?)#{Mx[:fa_cite_c]}/,'\1'). + gsub(/#{Mx[:fa_strike_o]}(.+?)#{Mx[:fa_strike_c]}/,'\1'). + gsub(/#{Mx[:fa_monospace_o]}(.+?)#{Mx[:fa_monospace_c]}/,'\1'). + gsub(/#{Mx[:gl_o]}(●)#{Mx[:gl_c]}\s*/,'\1 '). + gsub(/#{Mx[:tag_o]}\S+?#{Mx[:tag_c]}/,'') #check @col[:seg]=@@seg if data.of ==:para \ || data.of ==:heading \ @@ -374,7 +374,7 @@ module SiSU_DbImport @col[:lid]+=1 txt=endnotes(txt).extract_any body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_minus - @col[:body]=special_character_escape(body) + @col[:body]=clean_document_objects_body(body) plaintext=@col[:body].dup plaintext=strip_markup(plaintext) @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) @@ -425,7 +425,7 @@ module SiSU_DbImport @base_url="#{@env.url.root}/#{@md.fnb}/#{@hname}.html" txt=endnotes(txt).extract_any body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_plus - @col[:body]=special_character_escape(body) + @col[:body]=clean_document_objects_body(body) plaintext=@col[:body].dup plaintext=strip_markup(plaintext) @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) @@ -470,7 +470,7 @@ module SiSU_DbImport @base_url="#{@env.url.root}/#{@md.fnb}/#{@hname}.html" txt=endnotes(txt).extract_any body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_plus - @col[:body]=special_character_escape(body) + @col[:body]=clean_document_objects_body(body) plaintext=@col[:body].dup plaintext=strip_markup(plaintext) @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) @@ -501,7 +501,7 @@ module SiSU_DbImport @base_url="#{@env.url.root}/#{@md.fnb}/#{@hname}.html" txt=endnotes(txt).extract_any body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_plus - @col[:body]=special_character_escape(body) + @col[:body]=clean_document_objects_body(body) plaintext=@col[:body].dup plaintext=strip_markup(plaintext) @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) @@ -532,7 +532,7 @@ module SiSU_DbImport @base_url="#{@env.url.root}/#{@md.fnb}/#{@hname}.html" txt=endnotes(txt).extract_any body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_plus - @col[:body]=special_character_escape(body) + @col[:body]=clean_document_objects_body(body) plaintext=@col[:body].dup plaintext=strip_markup(plaintext) @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) @@ -603,7 +603,7 @@ module SiSU_DbImport else SiSU_FormatShared::CSS_Format.new(@md,data).norm end - @col[:body]=special_character_escape(body) + @col[:body]=clean_document_objects_body(body) plaintext=@col[:body].dup plaintext=strip_markup(plaintext) @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) diff --git a/lib/sisu/v5/db_sqltxt.rb b/lib/sisu/v5/db_sqltxt.rb index 6585fd66..3f6cf951 100644 --- a/lib/sisu/v5/db_sqltxt.rb +++ b/lib/sisu/v5/db_sqltxt.rb @@ -60,7 +60,7 @@ module SiSU_DbText class Prepare def special_character_escape(str) - str=str.gsub(/'/,"''"). #string.gsub!(/'/,"\047") #string.gsub!(/'/,"\\'") + str=str.gsub(/'/m,"''"). #string.gsub!(/'/,"\047") #string.gsub!(/'/,"\\'") gsub(/(\\)/m,'\1\1'). #ok but with warnings, double backslash on sqlite #str.gsub!(/[\\]/m,'\\x5C') #ok but with warnings, but not for sqlite #str.gsub!(/(\\)/m,'\1') #ok for sqlite not for pgsql gsub(/#{Mx[:br_line]}|#{Mx[:br_nl]}/m,"
\n"). gsub(/#{Mx[:tag_o]}\S+?#{Mx[:tag_c]}/m,''). #check @@ -80,13 +80,29 @@ module SiSU_DbText gsub(/#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]}/m,''). gsub(/ \s+/m,' ') #p s if s =~/[^ \nA-Za-z0-9'"`?!#@$%^&*=+,.;:\[\]()<>{}‹›|\\\/~_-]/ - s + txt_arr << s end - txt_arr << arr << en - #txt_arr=txt_arr.flatten + txt_arr=txt_arr << en txt=txt_arr.flatten.join("\n") - txt=special_character_escape(txt) - txt + special_character_escape(txt) + end + def clean_document_objects_body(arr) + txt_arr,en,en_arr=[],[],[] + arr=(arr.is_a?(String)) ? [ arr ] : arr + arr.each do |s| + en << s.scan(/#{Mx[:en_a_o]}\s*(.+?)\s*#{Mx[:en_a_c]}/m) + s=s.gsub(/#{Mx[:en_a_o]}\s*(\d+).+?#{Mx[:en_a_c]}/m,'\1'). + gsub(/#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]}/m,''). + gsub(/ \s+/m,' ') + txt_arr << s + end + en.flatten.each do |e| + e=e.sub(/^(\d+)\s*/,'\1 ') + en_arr << e + end + txt_arr=txt_arr << en_arr + txt=txt_arr.flatten.join("\n
") + special_character_escape(txt) end def clean_searchable_text_from_document_source(arr) txt_arr,en=[],[] diff --git a/lib/sisu/v6/db_import.rb b/lib/sisu/v6/db_import.rb index 9473863d..5e159451 100644 --- a/lib/sisu/v6/db_import.rb +++ b/lib/sisu/v6/db_import.rb @@ -334,17 +334,17 @@ module SiSU_DbImport @en,@en_ast,@en_pls,@tuple_array=[],[],[],[] @col[:en_a],@col[:en_z]=nil,nil ao_array.each do |data| - data.obj.gsub!(/#{Mx[:fa_bold_o]}(.+?)#{Mx[:fa_bold_c]}/,'\1') - data.obj.gsub!(/#{Mx[:fa_italics_o]}(.+?)#{Mx[:fa_italics_c]}/,'\1') - data.obj.gsub!(/#{Mx[:fa_underscore_o]}(.+?)#{Mx[:fa_underscore_c]}/,'\1') - data.obj.gsub!(/#{Mx[:fa_superscript_o]}(.+?)#{Mx[:fa_superscript_c]}/,'\1') - data.obj.gsub!(/#{Mx[:fa_subscript_o]}(.+?)#{Mx[:fa_subscript_c]}/,'\1') - data.obj.gsub!(/#{Mx[:fa_insert_o]}(.+?)#{Mx[:fa_insert_c]}/,'\1') - data.obj.gsub!(/#{Mx[:fa_cite_o]}(.+?)#{Mx[:fa_cite_c]}/,'\1') - data.obj.gsub!(/#{Mx[:fa_strike_o]}(.+?)#{Mx[:fa_strike_c]}/,'\1') - data.obj.gsub!(/#{Mx[:fa_monospace_o]}(.+?)#{Mx[:fa_monospace_c]}/,'\1') - data.obj.gsub!(/#{Mx[:gl_o]}(●)#{Mx[:gl_c]}\s*/,'\1 ') - data.obj.gsub!(/#{Mx[:tag_o]}\S+?#{Mx[:tag_c]}/,'') #check + data.obj=data.obj.gsub(/#{Mx[:fa_bold_o]}(.+?)#{Mx[:fa_bold_c]}/,'\1'). + gsub(/#{Mx[:fa_italics_o]}(.+?)#{Mx[:fa_italics_c]}/,'\1'). + gsub(/#{Mx[:fa_underscore_o]}(.+?)#{Mx[:fa_underscore_c]}/,'\1'). + gsub(/#{Mx[:fa_superscript_o]}(.+?)#{Mx[:fa_superscript_c]}/,'\1'). + gsub(/#{Mx[:fa_subscript_o]}(.+?)#{Mx[:fa_subscript_c]}/,'\1'). + gsub(/#{Mx[:fa_insert_o]}(.+?)#{Mx[:fa_insert_c]}/,'\1'). + gsub(/#{Mx[:fa_cite_o]}(.+?)#{Mx[:fa_cite_c]}/,'\1'). + gsub(/#{Mx[:fa_strike_o]}(.+?)#{Mx[:fa_strike_c]}/,'\1'). + gsub(/#{Mx[:fa_monospace_o]}(.+?)#{Mx[:fa_monospace_c]}/,'\1'). + gsub(/#{Mx[:gl_o]}(●)#{Mx[:gl_c]}\s*/,'\1 '). + gsub(/#{Mx[:tag_o]}\S+?#{Mx[:tag_c]}/,'') #check @col[:seg]=@@seg if data.of ==:para \ || data.of ==:heading \ @@ -374,7 +374,7 @@ module SiSU_DbImport @col[:lid]+=1 txt=endnotes(txt).extract_any body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_minus - @col[:body]=special_character_escape(body) + @col[:body]=clean_document_objects_body(body) plaintext=@col[:body].dup plaintext=strip_markup(plaintext) @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) @@ -425,7 +425,7 @@ module SiSU_DbImport @base_url="#{@env.url.root}/#{@md.fnb}/#{@hname}.html" txt=endnotes(txt).extract_any body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_plus - @col[:body]=special_character_escape(body) + @col[:body]=clean_document_objects_body(body) plaintext=@col[:body].dup plaintext=strip_markup(plaintext) @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) @@ -470,7 +470,7 @@ module SiSU_DbImport @base_url="#{@env.url.root}/#{@md.fnb}/#{@hname}.html" txt=endnotes(txt).extract_any body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_plus - @col[:body]=special_character_escape(body) + @col[:body]=clean_document_objects_body(body) plaintext=@col[:body].dup plaintext=strip_markup(plaintext) @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) @@ -501,7 +501,7 @@ module SiSU_DbImport @base_url="#{@env.url.root}/#{@md.fnb}/#{@hname}.html" txt=endnotes(txt).extract_any body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_plus - @col[:body]=special_character_escape(body) + @col[:body]=clean_document_objects_body(body) plaintext=@col[:body].dup plaintext=strip_markup(plaintext) @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) @@ -532,7 +532,7 @@ module SiSU_DbImport @base_url="#{@env.url.root}/#{@md.fnb}/#{@hname}.html" txt=endnotes(txt).extract_any body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_plus - @col[:body]=special_character_escape(body) + @col[:body]=clean_document_objects_body(body) plaintext=@col[:body].dup plaintext=strip_markup(plaintext) @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) @@ -603,7 +603,7 @@ module SiSU_DbImport else SiSU_FormatShared::CSS_Format.new(@md,data).norm end - @col[:body]=special_character_escape(body) + @col[:body]=clean_document_objects_body(body) plaintext=@col[:body].dup plaintext=strip_markup(plaintext) @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) diff --git a/lib/sisu/v6/db_sqltxt.rb b/lib/sisu/v6/db_sqltxt.rb index 2fd39fb7..2375d5ca 100644 --- a/lib/sisu/v6/db_sqltxt.rb +++ b/lib/sisu/v6/db_sqltxt.rb @@ -60,7 +60,7 @@ module SiSU_DbText class Prepare def special_character_escape(str) - str=str.gsub(/'/,"''"). #string.gsub!(/'/,"\047") #string.gsub!(/'/,"\\'") + str=str.gsub(/'/m,"''"). #string.gsub!(/'/,"\047") #string.gsub!(/'/,"\\'") gsub(/(\\)/m,'\1\1'). #ok but with warnings, double backslash on sqlite #str.gsub!(/[\\]/m,'\\x5C') #ok but with warnings, but not for sqlite #str.gsub!(/(\\)/m,'\1') #ok for sqlite not for pgsql gsub(/#{Mx[:br_line]}|#{Mx[:br_nl]}/m,"
\n"). gsub(/#{Mx[:tag_o]}\S+?#{Mx[:tag_c]}/m,''). #check @@ -80,13 +80,29 @@ module SiSU_DbText gsub(/#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]}/m,''). gsub(/ \s+/m,' ') #p s if s =~/[^ \nA-Za-z0-9'"`?!#@$%^&*=+,.;:\[\]()<>{}‹›|\\\/~_-]/ - s + txt_arr << s end - txt_arr << arr << en - #txt_arr=txt_arr.flatten + txt_arr=txt_arr << en txt=txt_arr.flatten.join("\n") - txt=special_character_escape(txt) - txt + special_character_escape(txt) + end + def clean_document_objects_body(arr) + txt_arr,en,en_arr=[],[],[] + arr=(arr.is_a?(String)) ? [ arr ] : arr + arr.each do |s| + en << s.scan(/#{Mx[:en_a_o]}\s*(.+?)\s*#{Mx[:en_a_c]}/m) + s=s.gsub(/#{Mx[:en_a_o]}\s*(\d+).+?#{Mx[:en_a_c]}/m,'\1'). + gsub(/#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]}/m,''). + gsub(/ \s+/m,' ') + txt_arr << s + end + en.flatten.each do |e| + e=e.sub(/^(\d+)\s*/,'\1 ') + en_arr << e + end + txt_arr=txt_arr << en_arr + txt=txt_arr.flatten.join("\n
") + special_character_escape(txt) end def clean_searchable_text_from_document_source(arr) txt_arr,en=[],[] -- cgit v1.2.3