From 63c5a3cead1fb5cbd9b1bff653f269dce8d8052c Mon Sep 17 00:00:00 2001 From: Ralph Amissah Date: Tue, 20 Apr 2010 19:01:55 -0400 Subject: db name, tables, columns, indexes changes, review (need another version bump 2.2.0) * db (sql) table structure, further review and changes (hence breakage & version bump) * new pgsql db name prefix "sisu_v2b_" * new table column words in doc_objects & endnotes, VARCHAR 3000 to contain list of unique sorted words in object * increase use of VARCHAR * constants takes on related additions * param, extensive db column size checks for metadata --- lib/sisu/v2/db_import.rb | 120 ++++++++++++++++++++++++++++------------------- 1 file changed, 71 insertions(+), 49 deletions(-) (limited to 'lib/sisu/v2/db_import.rb') diff --git a/lib/sisu/v2/db_import.rb b/lib/sisu/v2/db_import.rb index 45aca11b..e351f6fc 100644 --- a/lib/sisu/v2/db_import.rb +++ b/lib/sisu/v2/db_import.rb @@ -122,7 +122,7 @@ module SiSU_DB_import tell.puts_blue unless @opt.cmd =~/q/ tell=SiSU_Screen::Ansi.new(@opt.cmd,'Marshal Load',@fnc) tell.print_grey if @opt.cmd =~/v/ - select_first_match=%{ SELECT metadata_and_text.tid FROM metadata_and_text WHERE metadata_and_text.filename = '#{@opt.fns}'; } + select_first_match=%{ SELECT metadata_and_text.tid FROM metadata_and_text WHERE metadata_and_text.src_filename = '#{@opt.fns}'; } file_exist=@sql_type=~/sqlite/ \ ? @conn.get_first_value(select_first_match) \ : @conn.select_one(select_first_match) @@ -265,7 +265,10 @@ module SiSU_DB_import @col[:body]=SiSU_Format_Shared::CSS_Format.new(@md,data).lev4_minus special_character_escape(@col[:body]) @col[:plaintext]=@col[:body].dup - strip_markup(@col[:plaintext]) + @col[:plaintext]=strip_markup(@col[:plaintext]) + @col[:plaintext]=clean_searchable_text(@col[:plaintext]) + @col[:words]=@col[:plaintext].dup + @col[:words]=unique_words(@col[:words]) if @en[0]; @en_a,@en_z=@en[0].first,@en[0].last end if @en_ast[0]; @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last @@ -298,7 +301,10 @@ module SiSU_DB_import @col[:body]=SiSU_Format_Shared::CSS_Format.new(@md,data).lev4_plus special_character_escape(@col[:body]) @col[:plaintext]=@col[:body].dup - strip_markup(@col[:plaintext]) + @col[:plaintext]=strip_markup(@col[:plaintext]) + @col[:plaintext]=clean_searchable_text(@col[:plaintext]) + @col[:words]=@col[:plaintext].dup + @col[:words]=unique_words(@col[:words]) @en_a,@en_z=@en[0].first,@en[0].last if @en[0] @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0] @en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0] @@ -324,7 +330,10 @@ module SiSU_DB_import @col[:body]=SiSU_Format_Shared::CSS_Format.new(@md,data).lev4_plus special_character_escape(@col[:body]) @col[:plaintext]=@col[:body].dup - strip_markup(@col[:plaintext]) + @col[:plaintext]=strip_markup(@col[:plaintext]) + @col[:plaintext]=clean_searchable_text(@col[:plaintext]) + @col[:words]=@col[:plaintext].dup + @col[:words]=unique_words(@col[:words]) @en_a,@en_z=@en[0].first,@en[0].last if @en[0] @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0] @en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0] @@ -350,7 +359,10 @@ module SiSU_DB_import @col[:body]=SiSU_Format_Shared::CSS_Format.new(@md,data).lev4_plus special_character_escape(@col[:body]) @col[:plaintext]=@col[:body].dup - strip_markup(@col[:plaintext]) + @col[:plaintext]=strip_markup(@col[:plaintext]) + @col[:plaintext]=clean_searchable_text(@col[:plaintext]) + @col[:words]=@col[:plaintext].dup + @col[:words]=unique_words(@col[:words]) @en_a,@en_z=@en[0].first,@en[0].last if @en[0] @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0] @en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0] @@ -389,12 +401,15 @@ module SiSU_DB_import end special_character_escape(@col[:body]) @col[:plaintext]=@col[:body].dup - strip_markup(@col[:plaintext]) + @col[:plaintext]=strip_markup(@col[:plaintext]) + @col[:plaintext]=clean_searchable_text(@col[:plaintext]) + @col[:words]=@col[:plaintext].dup + @col[:words]=unique_words(@col[:words]) t=SiSU_DB_tuple::Load_documents.new(@conn,@col,@opt,@file) @tuple_array << t.tuple @en,@en_ast,@en_pls=[],[],[] @col[:en_a]=@col[:en_z]=nil - @col[:lev]=@col[:plaintext]=@col[:body]='' + @col[:lev]=@col[:plaintext]=@col[:body]=@col[:words]='' end if notedata =~/#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}/ #% import into database endnotes tables endnote_array=notedata.scan(/#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}/) @@ -406,9 +421,9 @@ module SiSU_DB_import @id_n+=1 special_character_escape(txt) body=SiSU_Format_Shared::CSS_Format.new(@md,data).endnote(nr,txt) - #special_character_escape(body) - #special_character_escape(txt) strip_markup(txt) + words=txt.dup + words=unique_words(words) if txt.size > (SiSU_DB_columns::Column_size.new.endnote_clean - 1) puts "\n\nTOO LARGE (ENDNOTE - see error log)\n\n" open("#{Dir.pwd}/pg_documents_error_log",'a') do |error| @@ -418,16 +433,17 @@ module SiSU_DB_import end if txt en={ :type => 'endnotes', - :id => @id_n, - :lid => @col[:lid], - :nr => nr, - :txt => txt, - :body => body, - :ocn => @col[:ocn], - :ocnd => @col[:ocnd], - :ocns => @col[:ocns], - :id_t => @@id_t, - :hash => digest_clean + :id => @id_n, + :lid => @col[:lid], + :nr => nr, + :txt => txt, + :body => body, + :words => words, + :ocn => @col[:ocn], + :ocnd => @col[:ocnd], + :ocns => @col[:ocns], + :id_t => @@id_t, + :hash => digest_clean } t=SiSU_DB_tuple::Load_endnotes.new(@conn,en,@opt,@file) @tuple_array << t.tuple @@ -447,6 +463,8 @@ module SiSU_DB_import special_character_escape(txt) body=SiSU_Format_Shared::CSS_Format.new(@md,data).endnote(nr,txt) strip_markup(txt) + words=txt.dup + words=unique_words(words) if txt.size > (SiSU_DB_columns::Column_size.new.endnote_clean - 1) puts "\n\nTOO LARGE (ENDNOTE - see error log)\n\n" open("#{Dir.pwd}/pg_documents_error_log",'a') do |error| @@ -456,16 +474,17 @@ module SiSU_DB_import end if txt en={ :type => 'endnotes_asterisk', - :id => @id_n, - :lid => @col[:lid], - :nr => nr, - :txt => txt, - :body => body, - :ocn => @col[:ocn], - :ocnd => @col[:ocnd], - :ocns => @col[:ocns], - :id_t => @@id_t, - :hash => digest_clean + :id => @id_n, + :lid => @col[:lid], + :nr => nr, + :txt => txt, + :body => body, + :words => words, + :ocn => @col[:ocn], + :ocnd => @col[:ocnd], + :ocns => @col[:ocns], + :id_t => @@id_t, + :hash => digest_clean } t=SiSU_DB_tuple::Load_endnotes.new(@conn,en,@opt,@file) @tuple_array << t.tuple @@ -485,6 +504,8 @@ module SiSU_DB_import special_character_escape(txt) body=SiSU_Format_Shared::CSS_Format.new(@md,data).endnote(nr,txt) strip_markup(txt) + words=txt.dup + words=unique_words(words) if txt.size > (SiSU_DB_columns::Column_size.new.endnote_clean - 1) puts "\n\nTOO LARGE (ENDNOTE - see error log)\n\n" open("#{Dir.pwd}/pg_documents_error_log",'a') do |error| @@ -494,16 +515,17 @@ module SiSU_DB_import end if txt en={ :type => 'endnotes_plus', - :id => @id_n, - :lid => @col[:lid], - :nr => nr, - :txt => txt, - :body => body, - :ocn => @col[:ocn], - :ocnd => @col[:ocnd], - :ocns => @col[:ocns], - :id_t => @@id_t, - :hash => digest_clean + :id => @id_n, + :lid => @col[:lid], + :nr => nr, + :txt => txt, + :body => body, + :words => words, + :ocn => @col[:ocn], + :ocnd => @col[:ocnd], + :ocns => @col[:ocns], + :id_t => @@id_t, + :hash => digest_clean } t=SiSU_DB_tuple::Load_endnotes.new(@conn,en,@opt,@file) @tuple_array << t.tuple @@ -526,25 +548,25 @@ module SiSU_DB_import endnotes(@txt).range @en << endnotes(@txt).standard if @txt =~/#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}/ @en_ast << endnotes(@txt).asterisk if @txt =~/#{Mx[:en_b_o]}\*.+?#{Mx[:en_b_c]}/ - @en_pls << endnotes(@txt).plus if @txt =~/#{Mx[:en_b_o]}\+.+?#{Mx[:en_b_o]}/ + @en_pls << endnotes(@txt).plus if @txt =~/#{Mx[:en_b_o]}\+.+?#{Mx[:en_b_c]}/ @txt=endnotes(@txt).clean_text end @txt end def standard - x=if @txt =~/#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}/; @txt.scan(/#{Mx[:en_a_o]}(\d+).+?#{Mx[:en_a_c]}/) - else nil - end + x=(@txt =~/#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}/) \ + ? @txt.scan(/#{Mx[:en_a_o]}(\d+).+?#{Mx[:en_a_c]}/) \ + : nil end def asterisk - x=if @txt =~/#{Mx[:en_b_o]}\*.+?#{Mx[:en_b_c]}/; @txt.scan(/#{Mx[:en_b_o]}[*](\d+).+?#{Mx[:en_b_c]}/) - else nil - end + x=(@txt =~/#{Mx[:en_b_o]}\*.+?#{Mx[:en_b_c]}/) \ + ? @txt.scan(/#{Mx[:en_b_o]}[*](\d+).+?#{Mx[:en_b_c]}/) \ + : nil end def plus - x=if @txt =~/#{Mx[:en_b_o]}\+.+?#{Mx[:en_b_c]}/; @txt.scan(/#{Mx[:en_b_o]}[+](\d+).+?#{Mx[:en_b_c]}/) - else nil - end + x=(@txt =~/#{Mx[:en_b_o]}\+.+?#{Mx[:en_b_c]}/) \ + ? @txt.scan(/#{Mx[:en_b_o]}[+](\d+).+?#{Mx[:en_b_c]}/) \ + : nil end def clean_text(base_url=nil) if base_url -- cgit v1.2.3