diff options
Diffstat (limited to 'lib/sisu/develop/db_import.rb')
-rw-r--r-- | lib/sisu/develop/db_import.rb | 877 |
1 files changed, 877 insertions, 0 deletions
diff --git a/lib/sisu/develop/db_import.rb b/lib/sisu/develop/db_import.rb new file mode 100644 index 00000000..c221885c --- /dev/null +++ b/lib/sisu/develop/db_import.rb @@ -0,0 +1,877 @@ +# encoding: utf-8 +=begin + +* Name: SiSU + +** Description: documents, structuring, processing, publishing, search +*** modules shared by the different db types, dbi, postgresql, sqlite + +** Author: Ralph Amissah + <ralph@amissah.com> + <ralph.amissah@gmail.com> + +** Copyright: (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, + 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Ralph Amissah, + All Rights Reserved. + +** License: GPL 3 or later: + + SiSU, a framework for document structuring, publishing and search + + Copyright (C) Ralph Amissah + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the Free + Software Foundation, either version 3 of the License, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + You should have received a copy of the GNU General Public License along with + this program. If not, see <http://www.gnu.org/licenses/>. 
=begin

  If you have Internet connection, the latest version of the GPL should be
  available at these locations:
  <http://www.fsf.org/licensing/licenses/gpl.html>
  <http://www.gnu.org/licenses/gpl.html>

  <http://www.sisudoc.org/sisu/en/manifest/gpl.fsf.html>

** SiSU uses:
  * Standard SiSU markup syntax,
  * Standard SiSU meta-markup syntax, and the
  * Standard SiSU object citation numbering and system

** Homepages:
  <http://www.jus.uio.no/sisu>
  <http://www.sisudoc.org>

** Git
  <http://git.sisudoc.org/gitweb/?p=code/sisu.git;a=summary>
  <http://git.sisudoc.org/gitweb/?p=code/sisu.git;a=blob;f=lib/sisu/develop/db_import.rb;hb=HEAD>

=end
module SiSU_DbImport
  require_relative 'db_columns'                            # db_columns.rb
  require_relative 'db_load_tuple'                         # db_load_tuple.rb
  require_relative 'db_sqltxt'                             # db_sqltxt.rb
  require_relative 'html_lite_shared'                      # html_lite_shared.rb
  # Collects the SQL statements for one document (metadata, document objects,
  # endnotes and output-file urls) and runs them against the connection in a
  # single transaction. Works with either a PostgreSQL connection (sql_type
  # :pg, uses #exec) or an sqlite3 connection (sql_type :sqlite, uses
  # #execute / #transaction).
  class Import < SiSU_DbText::Prepare
    include SiSU_Param
    include SiSU_Screen
    include SiSU_DbAction
    @@dl=nil
    @@hname=nil
    attr_accessor :tp
    # opt::        run options (reads opt.fns, opt.lng, opt.act, opt.selections)
    # conn::       open database connection
    # file_maint:: passed through unchanged to the tuple loaders
    # sql_type::   :pg (default) or :sqlite
    #
    # Seeds the running object id (@col[:lid]) and endnote id (@id_n) from the
    # current maxima in the database; both fall back to 0 when the query fails
    # (e.g. tables not yet created).
    def initialize(opt,conn,file_maint,sql_type=:pg)
      @opt,@conn,@file_maint,@sql_type=opt,conn,file_maint,sql_type
      @cX=SiSU_Screen::Ansi.new(@opt.act[:color_state][:set]).cX
      @env=SiSU_Env::InfoEnv.new(@opt.fns)
      @dal="#{@env.processing_path.ao}"
      @fnb=if @opt.fns.empty? || @opt.selections.str.empty?
        ''
      else
        @md=SiSU_Param::Parameters.new(@opt).get
        @md.fnb
      end
      @fnc="#{@dal}/#{@opt.fns}.content.rbm"
      @@seg,@@seg_full='','' #create? consider placing field just before clean text as opposed to seg which contains seg(.html) name info seg_full would contain seg info for levels 5 & 6 where available eg seg_full may be 7.3 (level 5) and 7.3.1 (level 6) where seg is 7
      @col=Hash.new('')
      @col[:ocn]=''
      @counter={}
      @db=SiSU_Env::InfoDb.new
      if @sql_type==:sqlite
        # the driver is recognised by comparing the first 10 characters of the
        # two connections' #inspect strings
        conn_tag=@conn.inspect.match(/^(.{10})/)[1]
        sqlite3_tag=@db.sqlite.conn_sqlite3.inspect.match(/^(.{10})/)[1]
        @driver_sqlite3=(conn_tag==sqlite3_tag)
      end
      begin
        @col[:lid] ||=0
        @col[:lid]=query_max_id('SELECT MAX(lid) FROM doc_objects')
      rescue
        puts "#{__FILE__}:#{__LINE__}" if maintenance?
      end
      @col[:lid]=0 if @col[:lid].nil? or @col[:lid].to_s.empty?
      begin
        @id_n=query_max_id('SELECT MAX(nid) FROM endnotes')
        @id_n ||=0
      rescue
        puts "#{__FILE__}:#{__LINE__}" if maintenance?
      end
      # guard the endnote counter itself (previously @col[:lid] was tested here,
      # leaving @id_n nil whenever the query above raised)
      @id_n=0 if @id_n.nil? or @id_n.to_s.empty?
      @col[:lv0]=@col[:lv1]=@col[:lv2]=@col[:lv3]=@col[:lv4]=@col[:lv5]=@col[:lv6]=@col[:lv7]=0
      @db=SiSU_Env::InfoDb.new
      @pdf_fn=SiSU_Env::FileOp.new(@md).base_filename
      @@dl ||=SiSU_Env::InfoEnv.new.digest.length
    end
    # Imports the document named by @opt.fns unless a row with the same source
    # filename and language already exists in metadata_and_text.
    #
    # If the existence check fails because the tables are missing, the tables
    # are created (for PostgreSQL the check is then retried once). When the
    # import transaction fails, the generated statements are written to
    # "<processing sql path>/<fnb>.sql" for inspection.
    def marshal_load
      require_relative 'ao'                                # ao.rb
      @ao_array=SiSU_AO::Source.new(@opt).get              # ao file drawn here
      if verbose_reporting?
        SiSU_Screen::Ansi.new(
          @opt.act[:color_state][:set],
          "#{@db.psql.db}::#{@opt.fns}"
        ).puts_blue
      end
      if @opt.act[:verbose][:set]==:on
        SiSU_Screen::Ansi.new(
          @opt.act[:color_state][:set],
          'Marshal Load',
          @fnc
        ).puts_grey
      end
      # NOTE(review): values are interpolated straight into the SQL text;
      # confirm filenames / language codes cannot contain a single quote.
      select_first_match=%{
        SELECT metadata_and_text.tid
        FROM metadata_and_text
        WHERE metadata_and_text.src_filename = '#{@md.fns}'
        AND metadata_and_text.language_document_char = '#{@opt.lng}'
      ;} # note, for .ssm: @md.fns (is set during runtime & is) != @opt.fns @md.opt.fns
      file_exist=if @sql_type==:sqlite
        begin
          @conn.get_first_value(select_first_match)
        rescue SQLite3::Exception => e
          # not tested
          puts "Exception occurred"
          puts e
          SiSU_Utils::CodeMarker.new(__LINE__,__FILE__,:yellow).mark(
            "\n" \
            + 'Attempting to initialize db' + "\n" \
            + 'Creating db tables'
          )
          create_db_tables
        end
      else
        create_attempted=false                             # bounds the retry below to a single attempt
        begin
          @conn.exec(select_first_match).field_values("tid")[0]
        rescue PG::Error => e
          err=[
            e.result.error_field( PG::Result::PG_DIAG_SEVERITY ),
            e.result.error_field( PG::Result::PG_DIAG_SQLSTATE ),
            e.result.error_field( PG::Result::PG_DIAG_MESSAGE_PRIMARY ),
            e.result.error_field( PG::Result::PG_DIAG_MESSAGE_DETAIL ),
            e.result.error_field( PG::Result::PG_DIAG_MESSAGE_HINT ),
            e.result.error_field( PG::Result::PG_DIAG_STATEMENT_POSITION ),
            e.result.error_field( PG::Result::PG_DIAG_INTERNAL_POSITION ),
            e.result.error_field( PG::Result::PG_DIAG_INTERNAL_QUERY ),
            e.result.error_field( PG::Result::PG_DIAG_CONTEXT ),
            e.result.error_field( PG::Result::PG_DIAG_SOURCE_FILE ),
            e.result.error_field( PG::Result::PG_DIAG_SOURCE_LINE ),
            e.result.error_field( PG::Result::PG_DIAG_SOURCE_FUNCTION ),
          ]
          p err
          relation_missing=(err[2] =~/relation "\S+?" does not exist/ \
            or err.inspect =~/relation "\S+?" does not exist/)
          if relation_missing and not create_attempted
            create_attempted=true
            # interpolation (rather than String#+) tolerates a nil err[2]
            SiSU_Utils::CodeMarker.new(__LINE__,__FILE__,:yellow).mark(
              "\n#{err[2]}\n" \
              + 'Attempting to initialize db' + "\n" \
              + 'Creating db tables'
            )
            create_db_tables
            retry
          end
        end
      end
      if not file_exist
        t_d=[] # transaction_data
        t_d << db_import_metadata
        t_d << db_import_documents(@ao_array)
        t_d << db_import_urls(@ao_array,@fnc) #import OID on/off
        t_d=t_d.flatten
        if (@opt.act[:verbose_plus][:set]==:on \
        || maintenance?)
          puts @conn.class if defined? @conn.class
          puts @conn.driver_name if defined? @conn.driver_name
          puts @conn.driver if defined? @conn.driver
        end
        begin #% sql
          if @sql_type==:sqlite
            @conn.transaction do |conn|
              t_d.each do |sql|
                conn.execute(sql)
              end
            end
          else
            @conn.exec("BEGIN")
            t_d.each do |sql|
              @conn.exec(sql)
            end
            @conn.exec("COMMIT")
          end
        rescue
          SiSU_Errors::Rescued.new($!,$@,@opt.selections.str,@opt.fns).location do
            __LINE__.to_s + ':' + __FILE__
          end
          if @sql_type != :sqlite
            # a failed statement leaves a PostgreSQL transaction aborted; roll
            # it back so the connection remains usable for later work
            begin
              @conn.exec("ROLLBACK")
            rescue
              puts "#{__FILE__}:#{__LINE__}" if maintenance?
            end
          end
          sqlfn="#{@env.processing_path.sql}/#{@md.fnb}.sql"
          sql=File.new(sqlfn,'w')
          begin
            t_d.each {|i| sql.puts i}
          ensure
            sql.close                                      # flush the dump to disk
          end
          p sqlfn
          if maintenance?
            puts sql
            p @conn.methods.sort
            puts "#{__FILE__}:#{__LINE__}"
          end
        end
      else
        @db=SiSU_Env::InfoDb.new
        puts "\nfile #{@opt.fns} in language code #{@opt.lng} already exists in database #{@db.psql.db}, use --update instead?"
      end
    end
    # placeholder, intentionally empty
    def pf_db_import_transaction_open
    end
    # placeholder, intentionally empty
    def pf_db_import_transaction_close
    end
    # Flattens a book-index hash into a single string.
    #
    # book_idx:: nil/false, or a hash of term => {plus: ..., sub: [ {subterm => {plus: ...}}, ... ]}
    #
    # Each term is written as "term+plus"; when the term has sub-entries a ':'
    # and the sub-entries follow. Returns '' for a nil or empty index.
    def book_idx_hash_to_str(book_idx)
      return '' unless book_idx && !book_idx.empty?
      book_idx_str=''
      book_idx.each_pair do |term,entry|
        book_idx_str << %{#{term}+#{entry[:plus]}}
        next unless entry[:sub].length > 0
        sub_part=''
        entry[:sub].each do |subterms|
          subterms.each_pair do |subterm,subentry|
            sub_part << %{\n #{subterm}+#{subentry[:plus]} | }
          end
        end
        book_idx_str=book_idx_str + ':' + sub_part
      end
      book_idx_str
    end
    # Prepares the metadata tuple for the document and allocates its text id
    # (@@id_t = current MAX(tid) + 1). Also fills @tp with the escaped document
    # source and its searchable text when the source file exists.
    #
    # Returns the tuple produced by SiSU_DbTuple::LoadMetadata.
    def db_import_metadata #% import documents - populate database
      if verbose_reporting?
        print %{ #{@cX.grey}import documents dbi_unit #{@cX.off} }
      end
      @tp={}
      @md=SiSU_Param::Parameters.new(@opt).get
      #% sisutxt & fulltxt
      if FileTest.exist?(@md.fns)
        txt_arr=IO.readlines(@md.fns,'')
        src=txt_arr.join("\n")
        src=special_character_escape(src)
        @tp[:sisutxt_f],@tp[:sisutxt_i]='sisutxt, ',"'#{src}', "
        txt=clean_searchable_text_from_document_source(txt_arr)
        #txt=special_character_escape(txt)
        @tp[:fulltxt_f],@tp[:fulltxt_i]='fulltxt, ',"'#{txt}', "
      end
      #% title
      # NOTE: `and` is required here; `defined? a && b` would parse as defined?(a && b)
      if defined? @md.title.full \
      and @md.title.full=~/\S+/ # DublinCore 1 - title
        begin
          @@id_t ||=0
          id_t=query_max_id('SELECT MAX(tid) FROM metadata_and_text;')
          @@id_t=id_t if id_t
        rescue
          puts "#{__FILE__} #{__LINE__}" if maintenance?
        end
        @@id_t+=1 #bug related, needs to be performed once at start of file, but consider moving, as, placed here it means program will fail if document header lacks @title:
        if verbose_reporting?
          # uses @cX throughout (a stray @@cX was referenced here before)
          puts %{\n#{@cX.grey}Processing file number#{@cX.off}: #{@cX.green}#{@@id_t}#{@cX.off}}
        end
      end
      SiSU_DbDBI::Test.new(self,@opt).verify #% import title names, filenames (tuple)
      SiSU_DbTuple::LoadMetadata.new(@conn,@@id_t,@md,@file_maint).tuple
    end
    # Walks the document's abstract objects and builds the tuples for the
    # document-object and endnote tables.
    #
    # ao_array:: objects responding to obj, of, is, ln, lv, ocn, odv, osp,
    #            node, parent, name, idx (as read below)
    #
    # Only objects whose `of` is :para, :heading, :heading_insert, :block or
    # :group are imported. Errors are reported through SiSU_Errors::Rescued and
    # whatever was collected so far is returned.
    def db_import_documents(ao_array) #% import documents - populate main database table, import into substantive database tables (tuple)
      begin
        @col[:tid]=@@id_t
        @en,@en_ast,@en_pls,@tuple_array=[],[],[],[]
        @col[:en_a],@col[:en_z]=nil,nil
        ao_array.each do |data|
          # strip inline font-face markers, keeping the enclosed text
          data.obj=data.obj.gsub(/#{Mx[:fa_bold_o]}(.+?)#{Mx[:fa_bold_c]}/,'\1').
            gsub(/#{Mx[:fa_italics_o]}(.+?)#{Mx[:fa_italics_c]}/,'\1').
            gsub(/#{Mx[:fa_underscore_o]}(.+?)#{Mx[:fa_underscore_c]}/,'\1').
            gsub(/#{Mx[:fa_superscript_o]}(.+?)#{Mx[:fa_superscript_c]}/,'\1').
            gsub(/#{Mx[:fa_subscript_o]}(.+?)#{Mx[:fa_subscript_c]}/,'\1').
            gsub(/#{Mx[:fa_insert_o]}(.+?)#{Mx[:fa_insert_c]}/,'\1').
            gsub(/#{Mx[:fa_cite_o]}(.+?)#{Mx[:fa_cite_c]}/,'\1').
            gsub(/#{Mx[:fa_strike_o]}(.+?)#{Mx[:fa_strike_c]}/,'\1').
            gsub(/#{Mx[:fa_monospace_o]}(.+?)#{Mx[:fa_monospace_c]}/,'\1').
            gsub(/#{Mx[:gl_o]}(●)#{Mx[:gl_c]}\s*/,'\1 ').
            gsub(/#{Mx[:tag_o]}\S+?#{Mx[:tag_c]}/,'') #check
          @col[:seg]=@@seg
          next unless [:para,:heading,:heading_insert,:block,:group].include?(data.of) # regular text what of code-blocks grouped text etc.
          notedata=data.obj.dup
          segment_level=[4,5,6,7].find {|ln| data.ln==ln }
          if data.is==:heading \
          && (data.ln.inspect=~/[0-3]/)
            import_upper_heading(data)                     #% :headings above segment level
          elsif data.is==:heading \
          && segment_level
            import_segment_heading(data,segment_level)     #% :headings, levels 4 to 7
          elsif data.of==:structure \
          || data.of==:layout \
          || data.of==:comment
            #added watch
          else
            import_text_object(data)                       #% regular text
          end
          #% import into database endnotes tables
          import_endnotes(notedata,data,'endnotes',Mx[:en_a_o],Mx[:en_a_c])
          import_endnotes(notedata,data,'endnotes_asterisk',Mx[:en_b_o],Mx[:en_b_c],'\*')
          import_endnotes(notedata,data,'endnotes_plus',Mx[:en_b_o],Mx[:en_b_c],'\+')
        end
      rescue
        SiSU_Errors::Rescued.new($!,$@,@opt.selections.str,@opt.fns).location do
          __LINE__.to_s + ':' + __FILE__
        end
      end
      @tuple_array
    end
    # Sets the text the endnote helpers below operate on and returns self, so
    # calls read as endnotes(txt).extract_any, endnotes(txt).range, etc.
    def endnotes(txt)
      @txt=txt
      self
    end
    # When @txt contains a numbered endnote: records the endnote range in @col,
    # appends the note numbers found to @en / @en_ast / @en_pls, and replaces
    # each endnote in @txt by a <sup> reference. Returns @txt.
    def extract_any
      if @txt =~/(?:#{Mx[:en_a_o]}|#{Mx[:en_b_o]})[*+]?(\d+)\s+.+?(?:#{Mx[:en_a_c]}|#{Mx[:en_b_c]})/
        endnotes(@txt).range
        @en << endnotes(@txt).standard if @txt =~/#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}/
        @en_ast << endnotes(@txt).asterisk if @txt =~/#{Mx[:en_b_o]}\*.+?#{Mx[:en_b_c]}/
        @en_pls << endnotes(@txt).plus if @txt =~/#{Mx[:en_b_o]}\+.+?#{Mx[:en_b_c]}/
        @txt=endnotes(@txt).clean_text
      end
      @txt
    end
    # Numbers of the standard endnotes in @txt (String#scan result), or nil.
    def standard
      (@txt =~/#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}/) \
      ? @txt.scan(/#{Mx[:en_a_o]}(\d+).+?#{Mx[:en_a_c]}/)
      : nil
    end
    # Numbers of the asterisk endnotes in @txt (String#scan result), or nil.
    def asterisk
      (@txt =~/#{Mx[:en_b_o]}\*.+?#{Mx[:en_b_c]}/) \
      ? @txt.scan(/#{Mx[:en_b_o]}[*](\d+).+?#{Mx[:en_b_c]}/)
      : nil
    end
    # Numbers of the plus endnotes in @txt (String#scan result), or nil.
    def plus
      (@txt =~/#{Mx[:en_b_o]}\+.+?#{Mx[:en_b_c]}/) \
      ? @txt.scan(/#{Mx[:en_b_o]}[+](\d+).+?#{Mx[:en_b_c]}/)
      : nil
    end
    # Replaces each endnote in @txt by a <sup> reference; with base_url the
    # reference is a link to "<base_url>#_<number>". Returns the new @txt.
    def clean_text(base_url=nil)
      @txt=if base_url
        @txt.gsub(/#{Mx[:en_a_o]}(\d+).+?#{Mx[:en_a_c]}/,%{<sup><a href="#{base_url}#_\\1" name="-\\1">\\1</a></sup>}).
          gsub(/#{Mx[:en_b_o]}([*]\d+).+?#{Mx[:en_b_c]}/,%{<sup><a href="#{base_url}#_\\1" name="-\\1">\\1</a></sup>}).
          gsub(/#{Mx[:en_b_o]}([+]\d+).+?#{Mx[:en_b_c]}/,%{<sup><a href="#{base_url}#_\\1" name="-\\1">\\1</a></sup>})
      else
        @txt.gsub(/#{Mx[:en_a_o]}(\d+).+?#{Mx[:en_a_c]}/,'<sup>\1</sup>').
          gsub(/#{Mx[:en_b_o]}([*]\d+).+?#{Mx[:en_b_c]}/,'<sup>\1</sup>').
          gsub(/#{Mx[:en_b_o]}([+]\d+).+?#{Mx[:en_b_c]}/,'<sup>\1</sup>')
      end
      @txt
    end
    # Resets @col[:en_a] / @col[:en_z] and tries to set them to the first and
    # last endnote number in @txt. Returns @col.
    # NOTE(review): the per-word pattern requires whitespace (\s+) inside a
    # token produced by scan(/\S+/), so it looks like it can never match -
    # confirm intent ("not tested since change 2003w31").
    def range
      @col[:en_a]=@col[:en_z]=nil
      if @txt =~/#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}|#{Mx[:en_b_o]}([*]\d+).+?#{Mx[:en_b_c]}|#{Mx[:en_b_o]}([+]\d+).+?#{Mx[:en_b_c]}/
        word_array=@txt.scan(/\S+/)
        word_array.each do |w|
          if w[/(?:#{Mx[:en_a_o]}|#{Mx[:en_b_o]})[*+]?(\d+)\s+.+?(?:#{Mx[:en_a_c]}|#{Mx[:en_b_c]})(?:#{Mx[:en_a_c]}|#{Mx[:en_b_c]})/] # not tested since change 2003w31
            @col[:en_a]=$1 unless @col[:en_a]
            @col[:en_z]=@col[:en_a].dup unless @col[:en_a]
            @col[:en_z]=$1 if @col[:en_a]
          end
        end
      end
      @col
    end
    # Builds the urls tuple: for every output file of the document found on
    # disk, registers its column name and url.
    #
    # dbi_unit:: not used
    # content::  stored in @fnc
    #
    # Returns the tuple from SiSU_DbTuple::LoadUrls, or nil if an error was
    # rescued.
    def db_import_urls(dbi_unit,content) #% import documents OID - populate database
      tuple=nil
      begin
        @fnc=content
        @env=SiSU_Env::InfoEnv.new(@opt.fns)
        f,u={},{}
        if @fnb.nil? \
        or @fnb.empty?
          p 'file output path error' #remove
        end
        register=->(key,column,path,filename) do
          if FileTest.file?("#{path.dir}/#{filename}")
            f[key],u[key]="#{column},", "'#{path.url}/#{filename}',"
          end
        end
        out,base=@md.file.output_path,@md.file.base_filename
        register.(:txt,'plaintext',out.txt,base.txt)
        register.(:html_toc,'html_toc',out.html_seg,base.html_segtoc)
        register.(:html_doc,'html_doc',out.html_scroll,base.html_scroll)
        register.(:xhtml,'xhtml',out.xhtml,base.xhtml)
        register.(:xml_sax,'xml_sax',out.xml_sax,base.xml_sax)
        register.(:xml_dom,'xml_dom',out.xml_dom,base.xml_dom)
        register.(:epub,'epub',out.epub,base.epub)
        register.(:odf,'odf',out.odt,base.odt)
        # only the a4 pdf variants are checked (letter variants are not)
        register.(:pdf_p,'pdf_p',out.pdf,@pdf_fn.pdf_p_a4)
        register.(:pdf_l,'pdf_l',out.pdf,@pdf_fn.pdf_l_a4)
        register.(:concordance,'concordance',out.html_concordance,base.html_concordance)
        # latex_p / latex_l urls are not registered (previously commented out)
        register.(:digest,'digest',out.digest,base.digest)
        register.(:manifest,'manifest',out.manifest,base.manifest) #revisit, was to be text, this is html
        register.(:markup,'markup',out.src,base.src)
        register.(:sisupod,'sisupod',out.sisupod,base.sisupod)
        tuple=SiSU_DbTuple::LoadUrls.new(@conn,f,u,@@id_t,@opt,@file_maint).tuple
      rescue
        SiSU_Errors::Rescued.new($!,$@,@opt.selections.str,@opt.fns).location do
          __LINE__.to_s + ':' + __FILE__
        end
      end
      tuple
    end
    private
    # true when maintenance output is switched on
    def maintenance?
      @opt.act[:maintenance][:set]==:on
    end
    # true when verbose, verbose_plus or maintenance output is switched on
    def verbose_reporting?
      (@opt.act[:verbose][:set]==:on \
      || @opt.act[:verbose_plus][:set]==:on \
      || maintenance?)
    end
    # Runs a single-value "SELECT MAX(...)" query through whichever driver is
    # in use and returns the value as an Integer.
    def query_max_id(sql)
      @driver_sqlite3 \
      ? @conn.execute( sql ).join.to_i
      : @conn.exec( sql ).getvalue(0,0).to_i
    end
    # Creates the database tables through db_action; returns what #create returns.
    def create_db_tables
      sdb={
        create: SiSU_DbDBI::Create.new(@opt,@conn,@file_maint,@sql_type),
        index:  SiSU_DbDBI::Index.new(@opt,@conn,@file_maint,@sql_type),
      }
      db_action(sdb).create
    end
    # Copies the identifying fields of an object into @col; node is passed in
    # because regular text stores '' rather than data.node.
    def set_object_columns(data,node)
      @col[:ocn],@col[:ocnd],@col[:ocns]=data.ocn,data.odv,data.osp
      @col[:t_of],@col[:t_is]=data.of,data.is
      @col[:node],@col[:parent]=node,data.parent
      @col[:digest_clean],@col[:digest_all]='',''
    end
    # Remembers the current segment name as @hname / @@hname (keeping the last
    # known name when the segment is blank) and derives @base_url from it.
    def set_segment_url
      seg=@col[:seg].to_s
      @@hname=seg if @col[:seg] and not seg.empty?
      @hname=@@hname
      @env=SiSU_Env::InfoEnv.new(@md.fns)
      @base_url="#{@env.url.root}/#{@md.fnb}/#{@hname}.html"
    end
    # Fills the body, plaintext and book_idx columns from the formatted body.
    def set_body_columns(body,data)
      @col[:body]=clean_document_objects_body(body)
      plaintext=strip_markup(@col[:body].dup)
      @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)
      book_idx=book_idx_hash_to_str(data.idx)
      @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx)
    end
    # Records first/last endnote numbers from the first entry of each list.
    def set_endnote_ranges
      @en_a,@en_z=@en[0].first,@en[0].last if @en[0]
      @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0]
      @en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0]
    end
    # Appends the document-object tuple for the current @col state.
    def push_document_tuple
      @tuple_array << SiSU_DbTuple::LoadDocuments.new(@conn,@col,@opt,@file_maint).tuple
    end
    # Imports a heading whose level matches 0-3.
    def import_upper_heading(data)
      @col[:lev],@col[:lev_an]=data.ln,data.lv
      set_object_columns(data,data.node)
      @col[:lid]+=1
      endnotes(data.obj).extract_any
      body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_minus
      set_body_columns(body,data)
      set_endnote_ranges
      push_document_tuple
      # to_s: a Regexp in `when` never matches an Integer level, so without it
      # these per-level counters were never incremented
      case @col[:lev].to_s
      when /0/ then @col[:lv0]+=1
      when /1/ then @col[:lv1]+=1
      when /2/ then @col[:lv2]+=1
      when /3/ then @col[:lv3]+=1
      when /4/ then @col[:lv4]+=1
      end
      @col[:lev]=@col[:plaintext]=@col[:body]=''
    end
    # Imports a heading of level 4, 5, 6 or 7. A level 4 heading names the
    # current segment (@@seg); deeper levels record their name in @@seg_full
    # and stay within the current segment.
    def import_segment_heading(data,level)
      @col[:lev_an]=data.lv
      set_object_columns(data,data.node)
      if level==4
        @@seg=data.name
      else
        @@seg_full=data.name if data.name #check data.name
        @@seg ||='' #nil # watch
      end
      @col[:seg]=@@seg
      @col[:"lv#{level}"]+=1
      @col[:lid]+=1
      @col[:lev]=level
      set_segment_url
      endnotes(data.obj).extract_any
      body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_plus
      set_body_columns(body,data)
      set_endnote_ranges
      push_document_tuple
      @col[:lev]=@col[:plaintext]=@col[:body]=''
    end
    # Imports a non-heading object (stored with level 9) and resets the
    # per-object endnote state afterwards.
    def import_text_object(data)
      @col[:lid]+=1
      set_object_columns(data,'')
      @col[:lev]=9
      set_segment_url
      txt=endnotes(data.obj).extract_any
      if @sql_type==:pg \
      and txt.size > (SiSU_DbColumns::ColumnSize.new.document_clean - 1) # examine pg build & remove limitation
        puts "\n\nTOO LARGE (TXT - see error log)\n\n"
        File.open("#{Dir.pwd}/pg_documents_error_log",'a') do |error|
          # report the oversized text itself (@col[:body] is still empty here)
          error.puts("\n#{@opt.fns}\nTEXT BODY\n#{txt.size} object #{@col[:ocn]} -> #{txt.slice(0..500)}")
        end
        # NOTE(review): txt is not read again below - the stored body is built
        # from data - so this replacement appears to have no effect; confirm.
        txt=%{\n\nLARGE TEXT BLOCK OMITTED\n\n}
      end
      set_endnote_ranges
      # NOTE: `and` is required with `defined?` (see db_import_metadata)
      body=if data.is==:table
        SiSU_FormatShared::CSS_Format.new(@md,data).html_table
      elsif data.is==:code
        SiSU_FormatShared::CSS_Format.new(@md,data).code
      elsif defined? data.indent \
      and defined? data.hang \
      and data.indent =~/[1-9]/ \
      and data.indent == data.hang
        SiSU_FormatShared::CSS_Format.new(@md,data).indent(data.indent)
      elsif defined? data.indent \
      and defined? data.hang \
      and data.hang =~/[0-9]/ \
      and data.indent != data.hang
        SiSU_FormatShared::CSS_Format.new(@md,data).hang_indent(data.hang,data.indent)
      else
        SiSU_FormatShared::CSS_Format.new(@md,data).norm
      end
      set_body_columns(body,data)
      push_document_tuple
      @en,@en_ast,@en_pls=[],[],[]
      @col[:en_a]=@col[:en_z]=nil
      @col[:lev]=@col[:plaintext]=@col[:body]=@col[:words]=''
    end
    # Builds one endnote tuple per numbered endnote of the given kind found in
    # notedata.
    #
    # type::     value stored under :type ('endnotes', 'endnotes_asterisk', 'endnotes_plus')
    # en_open::  opening endnote marker (regex source, interpolated as-is)
    # en_close:: closing endnote marker (regex source, interpolated as-is)
    # marker::   regex source for the symbol preceding the number ('' for none)
    def import_endnotes(notedata,data,type,en_open,en_close,marker='')
      any_note=/#{en_open}#{marker}.+?#{en_close}/
      return unless notedata =~ any_note
      numbered_note=/#{en_open}#{marker}(\d+)(.+?)#{en_close}/
      notedata.scan(any_note).each do |inf|
        found=numbered_note.match(inf)
        next unless found
        nr,txt,digest_clean=found[1],found[2].strip,0
        @id_n ||=0
        @id_n+=1
        txt=special_character_escape(txt)
        body=SiSU_FormatShared::CSS_Format.new(@md,data).endnote(nr,txt)
        txt=strip_markup(txt)
        if txt.size > (SiSU_DbColumns::ColumnSize.new.endnote_clean - 1)
          puts "\n\nTOO LARGE (ENDNOTE - see error log)\n\n"
          File.open("#{Dir.pwd}/pg_documents_error_log",'a') do |error|
            error.puts("\n#{@opt.fns}\nENDNOTE\n#{txt.size} object #{@col[:ocn]},#{@col[:ocnd]},#{@col[:ocns]} -> #{txt.slice(0..500)}")
          end
          txt=%{\n\nLARGE TEXT BLOCK OMITTED\n\n}
        end
        en={
          type: type,
          id:   @id_n,
          lid:  @col[:lid],
          nr:   nr,
          txt:  txt,
          body: body,
          ocn:  @col[:ocn],
          ocnd: @col[:ocnd],
          ocns: @col[:ocns],
          id_t: @@id_t,
          hash: digest_clean
        }
        @tuple_array << SiSU_DbTuple::LoadEndnotes.new(@conn,en,@opt,@file_maint).tuple
      end
    end
  end
end
__END__