=begin * Name: SiSU * Description: a framework for document structuring, publishing and search * Author: Ralph Amissah * Copyright: (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 Ralph Amissah All Rights Reserved. * License: GPL 3 or later: SiSU, a framework for document structuring, publishing and search Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 Ralph Amissah This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . If you have Internet connection, the latest version of the GPL should be available at these locations: * SiSU uses: * Standard SiSU markup syntax, * Standard SiSU meta-markup syntax, and the * Standard SiSU object citation numbering and system * Hompages: * Download: * Ralph Amissah ** Description: preprocessing, (document abstraction), data abstraction used in subsequent processing =end module SiSU_DAL require "#{SiSU_lib}/defaults" require "#{SiSU_lib}/sysenv" require "#{SiSU_lib}/param" require "#{SiSU_lib}/dal_syntax" require "#{SiSU_lib}/dal_doc_str" require "#{SiSU_lib}/i18n" include SiSU_Env include SiSU_Param include SiSU_Viz include Syntax class Instantiate < SiSU_Param::Parameters::Instructions def initialize @@flag_vocab=0 @@endnote={} @@endnote_array=@@word_mode=[] @@endnote_counter,@@endnote_counter_asterisk,@@endnote_counter_dag=1,1,1 #added @@line_mode='' end end class Source #{@my_make_fns.meta}") if @md.cmd =~/M/ tell.txt_grey unless @md.cmd =~/q/ dal.each{|s| dal_array << "#{s.strip}\n\n" unless s.strip.empty?} dal_array end def read_fnm dal=[] dal=if FileTest.file?(@fnm); File.open(@fnm){ |f| dal=Marshal.load(f)} else SiSU_DAL::Source.new(@opt).create_dal end end end class Output def initialize(md,data) @md,@data=md,data @my_make=SiSU_Env::Create_file.new(@md.cmd,@md.fns) dir=SiSU_Env::Info_env.new(@md.fns) @hard="#{dir.path.dal}/#{@md.fns}.meta" end def hard_output if @md.cmd =~/M/ filename_meta=@my_make.file_meta @data.each {|s| filename_meta.puts s.strip + "\n\n" unless s.strip.empty?} else File.unlink(@hard) if FileTest.file?(@hard) end end def marshal marshal_meta=@my_make.marshal_meta File.open(marshal_meta,'w'){|f| Marshal.dump(@data.to_a,f)} end end class Make @@endnote={} @@endnote_array=@@word_mode=[] @@endnote_counter,@@endnote_counter_asterisk,@@endnote_counter_dag=1,1,1 @@comment='%' @@dp=nil def initialize(md,data) @md,@data=md,data @@word_mode=[] @env=SiSU_Env::Info_env.new(@md.fns) @skin=SiSU_Env::Info_skin.new(@md) @dp=@@dp ||=SiSU_Env::Info_env.new.digest.pattern l=SiSU_Env::Standardise_language.new.file_to_language(@md.fns) @language=l[:l] @tr=SiSU_Translate::Source.new(@md,@language) end def reset @@flag_vocab=0 @@endnote={} @@endnote_array=@@word_mode=[] @@endnote_counter,@@endnote_counter_asterisk,@@endnote_counter_dag=1,1,1 @@line_mode='' end def song reset data=@data @metafile="#{@env.path.dal}/#{@md.fns}.meta" my_make_source_file=SiSU_Env::Create_file.new(@md.cmd,@md.fns) data=data.join.split("\n\n") data=expand_insertions?(data) data=SiSU_document_structure::Code.new(@md,data).code data_new=[] data.each do |x| data_new << if x =~ /\n\n/m; x.split(/\n\n+/) else x end end data=data_new.flatten data=substitutions_and_insertions?(data) data=Syntax::Markup.new(@md,data).songsheet data=character_check(data) data=images(data) data=SiSU_document_structure::Tables.new(@md,data).tables data=numbering_song(data) #tr issue data=endnotes(data) data=object_digest(data) meta=metadata(data) outputdata=data + meta if @md.cmd =~/[mM]/ SiSU_DAL::Output.new(@md,outputdata).hard_output SiSU_DAL::Output.new(@md,outputdata).marshal end reset outputdata end protected def character_check(data) require 'iconv' reset @tuned_file=[] endnote_no=1 data.each do |para| para.strip! para.gsub!(/^([12])~\?\s+/,'\1~ ') #conditional header for incorporated document 2004w12 para.gsub!(/^[{~}]\s*$/,'') para.gsub!(/^#{@@comment}.*/,'') #remove comment and divider #% para.gsub!(/<~#>|~#\s*/,'<~#>') para.gsub!(/-#\s*/,'<-#><~#>') #para.gsub!(/(#\{{3} arch-tag:|0\{{3}~cvs)\s+/, "0{{~rcs ") #KEEP ... ENABLE WIDER USE OF REVISION CONTROL para.gsub!(/(~\{ )\s+/,'\1') para.gsub!(/ \/\//,'
') #added 2004w29 para.gsub!(/
/,'
') #needed by xml, xhtml etc. #para.gsub!(/

/,'

') #consider para.gsub!(/`/,"'") para.gsub!(/\342\200\231/,"'") #if para =~/’/ #Avoid #‘ ’ #“ ” para.gsub!(/\t/,' ') para.gsub!(/�/,' ') #watch, replace with char code para.gsub!(/[“”]/,'""') para.gsub!(/[­–—]/,'-') #— – chk para.gsub!(/·/,'*') para.gsub!(/\\copy(?:right)?\b/,'©') para.gsub!(/\\trademark\b|\\tm\b/,'®') #non_utf8(para) para=para + "\n" unless para =~/^<:code>/ case para when /\^~/ # endnotes #% Note must do this first (earlier loop) and then enter gathered data into ~^\d+ sub_para=para.dup @@endnote_array << sub_para.gsub!(/\n/,'').gsub!(/\^~\s+(.+)\s*/, %{~\{#{endnote_no} \\1 \}~}).strip endnote_no+=1 para=nil if para =~/\^~ .+/ #removes 'binary' endnote now in endnote array for later insertion end end @tuned_file << para unless para.nil? end @tuned_file=@tuned_file.flatten end def images(data) tuned_file=[] @rmgk=false if SiSU_Env::Info_settings.new.program?('rmagick'); @rmgk=SiSU_Env::Load.new('RMagick').prog else tell=SiSU_Screen::Ansi.new(@md.cmd,'use of RMagick is not enabled in sisurc.yml') tell.warn if @md.cmd =~/[vVM]/ end data.each do |para| para.strip! if para =~/\{\s*\S+\.(?:png|jpg|gif)(?:\s*|\s+.+)?\}(?:(?:https?|file|ftp):\S+|image)/ if para !~/\{\s*\S+\.(?:png|jpg|gif)\s+\d+x\d+\s+/ m=/\{\s*(\S+\.(?:png|jpg|gif))/ if @rmgk imgs=para.scan(m).flatten images=imgs.each do |image| dir=SiSU_Env::Info_env.new(@md.fns) path_image=[dir.path.image_source_local_tex,dir.path.image_source_remote_tex,dir.path.image_source_tex] image_path=nil path_image.each do |image_path| break if FileTest.exist?("#{image_path}/#{image}") end if FileTest.exist?("#{image_path}/#{image}") img=Magick::ImageList.new("#{image_path}/#{image}") img_col,img_row=img.columns,img.rows if img_col > img_row #landscape if img_col> 640 #480 img_col=640 #480 img_row=((1.00*img_col/img.columns)*img.rows).round end else #portrait if img_col> 640 #480 img_col=640 #480 img_row=((1.00*img_col/img.columns)*img.rows).round end if img_row > 640 img_row=640 img_col=((1.00*img_row/img.rows)*img.columns).round end end para.gsub!(/(#{image})/,"#{image} #{img_col}x#{img_row}") else para.gsub!(/\{\s*(\S+)\.(png|jpg|gif).+?\}((?:https?|file|ftp):\S+|image)/,'[ \1 (\2 missing) ]') end end else images=para.scan(m) do |image| tell=SiSU_Screen::Ansi.new(@md.cmd,'where image dimensions have not been provided RMagick is required',image) tell.warn #unless @opt.cmd =~/q/ end end end end para.gsub!(/\{\s+(\S+\.(?:png|jpg|gif))\s+/i,'{\1 ') if para =~/\{\s+\S+\.(?:png|jpg|gif).+?\}(?:(?:https?|file|ftp):\S+|image)/ tuned_file << para unless para.nil? end tuned_file end def output_filetypes_in_cmd(cmd_shortcut,source=nil) #make list of file types in shortcut command (as configured), e.g. when sisu -3 is used cf_defaults=SiSU_Env::Info_processing_flag.new cmd_list=case cmd_shortcut.to_s when /0/; cf_defaults.cf_0 when /1/; cf_defaults.cf_1 when /2/; cf_defaults.cf_2 when /3/; cf_defaults.cf_3 when /4/; cf_defaults.cf_4 when /5/; cf_defaults.cf_5 end file_type_names={} file_type_names[:gen],file_type_names[:src]=[],[] file_type_names[:gen] <<= if cmd_list =~ /y/; 'sisu_manifest.html' end file_type_names[:gen] <<= if cmd_list =~ /h/; ['toc.html', 'doc.html'] end file_type_names[:gen] <<= if cmd_list =~ /p/; ['landscape.pdf', 'portrait.pdf'] end #file_type_names[:gen] <<= if cmd_list =~ /i/; 'manpage.1' #end file_type_names[:gen] <<= if cmd_list =~ /o/; 'opendocument.odt' end file_type_names[:gen] <<= if cmd_list =~ /b/; 'scroll.xhtml' end file_type_names[:gen] <<= if cmd_list =~ /x/; 'sax.xml' end file_type_names[:gen] <<= if cmd_list =~ /X/; 'dom.xml' end file_type_names[:gen] <<= if cmd_list =~ /a/; 'plain.txt' end file_type_names[:gen] <<= if cmd_list =~ /g/; 'wiki.txt' end file_type_names[:gen] <<= if cmd_list =~ /w/; 'concordance.html' end file_type_names[:gen] <<= if cmd_list =~ /N/; 'digest.txt' end file_type_names[:src] <<= if source and cmd_shortcut =~ /s/; source end file_type_names[:src] <<= if cmd_shortcut =~ /S/; "#{source}.zip" end file_type_names[:gen]=file_type_names[:gen].flatten file_type_names[:src]=file_type_names[:src].flatten file_type_names end def expand_insertions?(data) tuned_file,tuned_file_tmp=[],[] data.each do |para| if para !~/^%+\s/ and para =~/\{(?:~\^\s+)?(.+?)\s\[(?:\d(?:[sS]*))\]\}(?:\.\.\/\S+?\/|\S+?\.ss[tm]\b)/ txt,cmd,source,url_dir,note,manifest=nil,nil,nil,nil,nil,nil @u=SiSU_Env::Info_env.new.url if defined? @u.remote if para =~/(.+?)\{(.+?)\s\[(\d[sS]*)\]\}((\S+?)\.ss[tm]\b)(.*)/m pre,txt,cmd,source,url_dir,note="#{$1.strip} ",$2,$3,$4,$5,$6 elsif para =~/\{(.+?)\s\[(\d[sS]*)\]\}((\S+?)\.ss[tm]\b)(.*)/ pre,txt,cmd,source,url_dir,note='',$1,$2,$3,$4,$5 end manifest="#{pre}{#{txt} }#{@u.remote}/#{url_dir}/toc.html#{note}\n\n" else puts "error, does currently support relative paths (reltive paths were removed, as had problems for citation, and was not suited to all output types should possibly reconsider) #{__FILE__} #{__LINE__}" if para =~/\{(?:~\^\s+)?(.+?)\s\[(\d[sS]*)\]\}\.\.\/(\S+?)\/(\s+~\{.+?\}~)?/ txt,cmd,url_dir,note=$1,$2,$3,$4 manifest="{ #{txt} }../#{url_dir}/toc.html#{note}\n\n" end end tuned_file_tmp << manifest output_filetypes=output_filetypes_in_cmd(cmd,source) output_filetypes[:gen].each do |o_f| describe = case o_f when /sisu_manifest.html/; '~^ document manifest' when /toc.html/; ' html, segmented text' when /doc.html/; ' html, scroll, document in one' when /landscape.pdf/; ' pdf, landscape' when /portrait.pdf/; ' pdf, portrait' when /opendocument.odt/; ' odf:odt, open document text' when /scroll.xhtml/; ' xhtml scroll' when /sax.xml/; ' xml, sax' when /dom.xml/; ' xml, dom' when /plain.txt/; ' plain text utf-8' #when /manpage.1/; ' man, 1' when /wiki.txt/; ' wiki text' when /concordance.html/; ' concordance' when /digest.txt/; ' dcc, document content certificate (digests)' else nil end if describe tuned_file_tmp << if @u.remote #to double space <:br> at beginning of entry "     { #{describe} }#{@u.remote}/#{url_dir}/#{o_f} " else if describe =~/^~\^ / "     {#{describe} }../#{url_dir}/#{o_f} " else "     { #{describe} }../#{url_dir}/#{o_f} " end end end end output_filetypes[:src].each do |o_f| describe=case o_f when /#{source}\.zip/; ' markup source (zipped) pod' when /#{source}/; ' markup source text' else nil end if describe tuned_file_tmp << if @u.remote x=if describe =~/zip/ "     {#{describe} }#{@u.src_pod}/#{o_f} " else "     {#{describe} }#{@u.src_txt}/#{o_f} " end else x=if describe =~/zip/ "     { #{describe} }../pod/#{o_f} " else "     { #{describe} }../zip/#{o_f} " end end end end tuned_file << 'group{' << tuned_file_tmp.join("\n") << '}group' #tuned_file << 'group{' << tuned_file_tmp.join("\n").strip << '}group' tuned_file_tmp=[] else tuned_file << para end end tuned_file end def substitutions_and_insertions?(data) tuned_file=[] if data[0] =~ /^#!\s*(?:\/usr\/bin\/env sisu|\/usr\/bin\/sisu)/ # remove bang from top #! (however file is stripped, so will be removed provided no content preceeds it) data[0].gsub!(/^#!\s*\/usr\/bin\/sisu/,'') data[0].gsub!(/^#!\s*\/usr\/bin\/env sisu/,'') end if data[0] =~ /^(SiSU\s+[\d.]*|sisu-[\d.]+)$/ # SiSU identifier data[0].gsub!(/^(SiSU\s*[\d.]*)$/,'% \1') data[0].gsub!(/^(sisu-[\d.]+)$/,'% \1') end data.each do |para| para=if @md.markup_version.to_f >= 0.38 SiSU_document_structure::Structure.new(@md,para).structure_markup_normalize else para end #para.gsub!(//,'\1') #consider, would permit use of text hyperlinks if desired, dal_syntax more appropriate? para.gsub!(/^((?:[1-9]|:?[A-C])~\S*)\s*$/,'\1~ [Note: heading marker::required title missing]~#') #conditional header for incorporated document 2004w12 if para =~/^@\S+?:/ para.gsub!(/^@(\S+?):\s+/,'0~\1 ') para.gsub!(/^@(\S+?):([+-])\s+/,'0~\1\2 ') end if para =~/<:insert\d+!?>/ \ and para !~/^%\s+/ @skin.select ins=SiSU_Viz::Inserts.new case para when /^\s*<:insert1>\s*$/ para=[] ins.insert1.split(/\n\n/).each{|x| para << x } when /^\s*<:insert2>\s*$/ para=[] ins.insert2.split(/\n\n/).each{|x| para << x } when /^\s*<:insert3>\s*$/ para=[] ins.insert3.split(/\n\n/).each{|x| para << x << "\n"} para=ins.insert3 when /^\s*<:insert4>\s*$/ para=[] ins.insert4.split(/\n\n/).each{|x| para << x << "\n"} para=ins.insert4 when /^\s*<:insert5>\s*$/ para=[] ins.insert5.split(/\n\n/).each{|x| para << x << "\n"} when /^\s*<:insert6>\s*$/ para=[] ins.insert6.split(/\n\n/).each{|x| para << x << "\n"} when /^\s*<:insert7>\s*$/ para=[] ins.insert7.split(/\n\n/).each{|x| para << x << "\n"} end para.each{|x| tuned_file << x } else tuned_file << para end tuned_file.flatten! tuned_file.compact! end tuned_file end def numbering_song(data) data=number_plaintext_para(data) data=name_endnote_seg(data) #tr issue data=auto_number_heading_ie_title(data) #tr issue data=ocn(data) #watch data=minor_numbering(data) data=name_para_seg_filename(data) data=set_heading_seg(data) unless @md.set_heading_seg data=set_heading_top(data) unless @md.set_heading_top data=set_header_title(data) unless @md.set_header_title data end def number_plaintext_para(data) @tuned_file=[] data.each do |para| para.gsub!(/(^|[^<][^v][^>])\n/,'\1 ') #messy, but idea is that tables should retain breaks para.gsub!(/^/,"\n") unless para =~/¡/ para.gsub!(/^\s+|\s$/,"\n") @tuned_file << para end @tuned_file=@tuned_file.flatten end def name_endnote_seg(data) @tuned_file=[] data.each do |para| para.gsub!(/<:3>\s*<:ee>/, <<-WOK #{@@endnote['special_align']}


\r #{@@endnote['seg_name_3']}

#{@@endnote['special_align_close']} WOK ) para.gsub!(/<:2>\s*<:ee>/, <<-WOK #{@@endnote['special_align']}


\r #{@@endnote['seg_name_2']}

#{@@endnote['special_align_close']} WOK ) para.gsub!(/<:1>\s*<:ee>/, <<-WOK #{@@endnote['special_align']}


\r #{@@endnote['seg_name_1']}

#{@@endnote['special_align_close']} WOK ) @tuned_file << para end # debug 2003w46 adding revision control info if @md.flag_auto_endnotes \ and @md.flag_separate_endnotes_make @tuned_file << "\n4~endnotes Endnotes <-#> <~0;0:0;u0>" end @tuned_file << "\n" @tuned_file=@tuned_file.flatten end def owner_details_seg data << '4~owner.details Owner Details' end def number_sub_heading(para,num,title_no) case para when /#{num}~- /; para.gsub!(/#{num}~- /,"#{title_no} ") when /^#{num}~#\s*/; para.gsub!(/^#{num}~#\s*/,"#{title_no} ") when /^#{num}~[a-z_\.]+ / para.gsub!(/^#{num}~([a-z_\.]+)\s+(.+)/i,%{#{num}~\\1 #{title_no} \\2 <:name##{title_no}>}) when /^#{num}~\s+#{title_no}/ para.gsub!(/^#{num}~ /,"#{num}~#{title_no} ") #where title contains title number else para.gsub!(/^#{num}~ /,"#{num}~#{title_no} #{title_no} ") #main, where title number is to be provided end if @md.toc_lev_limit \ and @md.toc_lev_limit < num para.gsub!(/^[5-8]~(?:~\S+)?\s*/,'!_ ') end para end def auto_number_heading_ie_title(data) #also does some segment naming @tuned_file=[] if @md.markup =~/num_top/ \ or @md.num_top # watch, 2003w23 input="#{@md.markup}"[/num_top\=([1-6])/,1] if @md.markup input||=@md.num_top if @md.num_top !~/^$/ end num_top=input.to_i t_no1=t_no2=t_no3=t_no4=0 no1=num_top; no2=(num_top + 1); no3=(num_top + 2); no4=(num_top + 3) t_not=0 data.each do |para| #@md.seg_names << [additions to segment names] if (@md.markup =~/num_top/ \ or (@md.num_top \ and @md.num_top !~/^$/)) \ and para !~/^0~/ if (para =~/^(?:#{no1}|^#{no2}|^#{no3}#{no4})~#/ \ and para !~/^4~endnotes?/) t_not+=1 #; t_no2=0; t_no3=0 para.gsub!(/^(#{no1})~#\s*/,"\\1~ps#{t_not} ") para.gsub!(/^(#{no2})~#\s*/,"\\1~ps#{t_not} ") para.gsub!(/^(#{no3})~#\s*/,"\\1~ps#{t_not} ") para.gsub!(/^(#{no4})~#\s*/,"\\1~ps#{t_not} ") end if para =~/#{no1}~/ @subnumber=1 @subnumber=0 if para =~/#{no1}~/ end if para =~/^[0-6]~[ \w-]/ \ and para !~ /(?:[0-6]~[\w-]+-|4~endnotes|^[0-6]~([a-z_\.]+)\s+[\d.]+)\s/ \ and para !~/<~#>|<-#>/ if para =~/^#{no1}~/ t_no1+=1; t_no2=0; t_no3=0 title_no="#{t_no1}" if not @md.seg_names.nil? \ and not @md.seg_names.include?(title_no) para.gsub!(/^#{no1}~\s+(\S+)#/,"#{no1}~#{title_no} \\1 #{title_no} ") #shift placement of auto-number to after first word, e.g. Article # not # Article, added on occasion of ABF (20040329) para.gsub!(/^#{no1}\{\s+(Article|Clause|Section)\s+#/i,%{#{no1}~#{title_no} \\1 #{title_no}. }) unless para =~/^#{no1}~\s+[\d.]+\s/ #fix -> if the title starts with a numbering scheme, do not auto-number, review para.gsub!(/^#{no1}~\s+/,"#{no1}~#{title_no} #{title_no}. ") end @md.seg_names << title_no #else puts "warning segment name #{title_no} already exists" end unless para =~/^#{no1}~([a-z_\.]+)\s+[A-Z]\.?\s/ #bug -> tmp fix, excludes A. B. C. lettering, but not roman numerals, is arbitrary, review required para.gsub!(/^#{no1}~([a-z_\.]+)\s+(.+)/i,%{#{no1}~\\1 #{title_no}. \\2 <:name##{title_no}>}) end para.gsub!(/^#{no1}~#\s*/,"#{title_no}. ") end if para =~/^#{no2}~/ t_no2+=1; t_no3=0 title_no="#{t_no1}.#{t_no2}" para=number_sub_heading(para,no2,title_no) end if para =~/^#{no3}~/ t_no3+=1 title_no="#{t_no1}.#{t_no2}.#{t_no3}" para=number_sub_heading(para,no3,title_no) end elsif para =~ /^[0-6]~[\w-]+-/ # endnotes, watch2005 para.gsub!(/^#{no1}~([a-z_\.]+)- /,"#{no1}~\\1 ") para.gsub!(/^#{no2}~([a-z_\.]+)- /,"#{no2}~\\1 ") para.gsub!(/^#{no3}~([a-z_\.]+)- /,"#{no3}~\\1 ") end elsif @md.markup =~/num_extract/ #AS DANGEROUS force enable with document, note already does this type of numbering for cisg, locate and coordinate logic, is currently misplaced in code, chengwei inspired 2004w23/4 unless para =~ /^[0-6]~\S+/ #endnotes watch? if para =~/^[1-6]~\s+([\d\.]+)/ #risky (must be unique) consider output to 4~~\d instead of 4~\d name_num=$1 para.gsub!(/^([1-6]~)\s+/,"\\1#{name_num} ") end end if @md.toc_lev_limit end end @tuned_file << para end @tuned_file=@tuned_file.flatten end def ocn(data) #and auto segment numbering increment @tuned_file=[] object_array=SiSU_document_structure::OCN.new(@md,data).ocn object_array.each do |o| @tuned_file <<= if o.ocn; "#{o.txt} <~#{o.ocn};#{o.lv};#{o.type}>" else o.txt end end @tuned_file=@tuned_file.flatten end def minor_numbering(data) #and auto segment numbering increment @tuned_file=[] number_small,letter_small=0,0 letter=%w( a b c d e f g h i j k l m n o p q r s t u v w x y z ) data.each do |para| if para =~/\w|\S|<|\(/ if para !~/^%% |^0~|^4~endnotes|^<\/center>|<:ee>|<:e[:_]>|^\^~ |<:e[:_]\d+?>|^<:p[bn]>|^<:\#|<:- |<[:!]!4|^(?:alt|code|group|poem|table)\{|^\}(?:alt|code|group|poem|table)|^\}table$|||||<\/tr>|


|\[endnotes\]|<:zz>|<:isbn-|<:journal-|<:conference-|/i #ocn here #  added with Tune.code #¡ if para=~/^[1-8]~/; number_small,letter_small=0,0 #% sub-number system, (baby numbering) reset with any change of major number (more obviously should be placed in number titles, but that is conditionally executed, check and move later) end if para =~/^#[ 1]/ letter_small=0 number_small=0 if para =~ /^#1/ number_small+=1 para.gsub!(/^#[ 1]/,"#{number_small}. ") #change 2004 end if para =~/^_# / para.gsub!(/^_# /,"<:i1> #{letter[letter_small]}. ") #change 2004 letter_small+=1 end end end @tuned_file << para end @tuned_file=@tuned_file.flatten end def name_para_seg_filename(data) # paragraph name/numbering rules # manual naming overrides, manual naming may be # alpha-numeric characters mixed, # numeric only (a number), if # all segments have been named, # the numbers used are over 1000 or # it is not minded that auto-numbering uses a funny scheme for naming segments (not yet implemented) # [for now a warning is printed for such documents on use of maintenance or very-verbose flag] # auto-naming takes the form of giving numbers to segments # the rules for which are as follows # if the title/heading text starts with a numeric, then that is used (1 3.1 3rd etc.) # otherwise the level 4 segment number from the embedded document structure info is used # if there is none a sequential number is designated, preceded by an underscore @tuned_file=[] art_filename_auto=1 @counter=1 @unique_auto_name=[] if not @md.seg_autoname_safe and @md.cmd =~/[MV]/ puts 'manual segment names, numbers used as names, risk warning (segmented html)' end data.each do |para| para=SiSU_document_structure::Structure.new(@md,para).structure_markup if para =~/^[456]~ / if para=~/^4/ \ and not @md.set_heading_seg @md.set_heading_seg=true end if para =~/^[456]~(?:\s\S+)?\s+([\d.,:-]+)/m #heading starts with a recognised numeric or word followed by a recognised numerical construct, use that as name pattern=$1 pattern.gsub!(/(?:[:,-]|\W)/,'.') pattern.gsub!(/\.$/,'') if not @md.seg_names.nil? \ and not @md.seg_names.include?(pattern) para.gsub!(/^([456])~\s*/,"\\1~#{pattern} ") @md.seg_names << pattern else puts 'warn, there may be a conflicting numbering scheme' if @md.cmd =~/[VM]/ end end if para =~/^4~\s.+?;4:(\d+);/m #extract segment name from embedded document structure info pattern=$1 pattern.gsub!(/(?:[:,-]|\W)/,'.') pattern.gsub!(/\.$/,'') if not @md.seg_names.nil? \ and not @md.seg_names.include?(pattern) para.gsub!(/^(4)~\s*/,"\\1~#{pattern} ") @md.seg_names << pattern else para.gsub!(/^(4)~\s*/,"\\1~~#{pattern} ") @md.seg_names << "~#{pattern}" end end if para =~/^4~\s+/ #if still not segment name, provide a numerical one if not @md.seg_names.nil? \ and not @md.seg_names.include?(art_filename_auto) para.gsub!(/^4~\s+/,%{4~_#{art_filename_auto} }) @md.seg_names << art_filename_auto else puts 'segment name (numbering) error' end art_filename_auto+=1 end end @tuned_file << if para =~/^([1-6])~/m \ and (@md.pagenew \ or @md.pagebreak) m=$1 #watch ref~ para_tmp=[] if @md.pagenew.to_s =~/#{m}/; para_tmp << "<:pn>\n" << para end if @md.pagebreak.to_s =~/#{m}/; para_tmp << "<:pb>\n" << para end para_result=unless para_tmp.length > 0; para else para_tmp end else para end end if @md.seg_names.length > 0 @md.set_heading_seg=true end @tuned_file=@tuned_file.flatten end def set_heading_top(data) #% make sure no false positives unless @md.set_heading_top puts "\tdocument contains no top level heading, (will have to manufacture one)" if @md.cmd =~/[MV]/ @tuned_file=[] data.each do |para| unless @md.set_heading_top if para !~/^(?:@\S+:|0~\S+)\s/m \ and para !~/\A\s*\Z/m @md.set_heading_top=true head=if @md.title ; "1~ #{@md.title}" else '1~ [no title provided]' end @tuned_file << head end end @tuned_file << para end @tuned_file=@tuned_file.flatten end end def set_heading_seg(data) #% make sure no false positives unless @md.set_heading_seg puts "\tdocument contains no segment level, (will have to manufacture one)" if @md.cmd =~/[MV]/ @tuned_file=[] data.each do |para| unless @md.set_heading_seg if para !~/^(?:@\S+:|0~\S+|[123]~)/m \ and para !~/\A\s*\Z/m \ and para !~/<:p[bn]>/ @md.set_heading_seg=true head=if @md.title ; "4~seg [#{@md.title}]" else '4~seg [segment]' end @tuned_file << head end end @tuned_file << para end @tuned_file=@tuned_file.flatten end end def set_header_title(data) #% make sure no false positives unless @md.set_header_title puts "\t no document title provided, (will have to manufacture one)" if @md.cmd =~/[MV]/ @tuned_file=[] data.each do |para| unless @md.set_header_title if para !~/^%{1,2}\s/m \ and para !~/\A\s*\Z/m @tuned_file << "0~title #{@md.heading_seg_first}" @md.title=@md.heading_seg_first @md.set_header_title=true end end @tuned_file << para end @tuned_file=@tuned_file.flatten end end def endnotes(data) @tuned_file=[] endnote_no,endnote_ref=1,1 #% endnote work zone data.each do |para| # manually numbered endnotes --> if @md.mod.inspect =~/--no-asterisk|--no-annotate/ para.gsub!(/~\[[*]\s.+?\]~/,'') end if @md.mod.inspect =~/--no-dagger|--no-annotate/ para.gsub!(/~\[[+]\s.+?\]~/,'') end unless para =~/^<:code>/ case para # auto-numbered endnotes --> when /~\{\s+.+?\}~|~\[[*+]\s+.+?\]~/ para.gsub!(/\s*(\}~|\]~)/,' \1') # required 2003w31 word_mode=para.scan(/<:group>\n|\n<:group-end>|\S+/m) word_mode=endnote_call_number(word_mode) para=word_mode.join(' ') endnote_ref+=1 when /~\^(?:\s|$)|<:e>/ #%Note inserts endnotes previously gathered from /^(|[-~]\{{3})/ (in earlier loop) word_mode=para.scan(/<:group>\n|\n<:group-end>|\S+/m) word_mode=endnote_call_number(word_mode) para=word_mode.join(' ') endnote_ref+=1 end end @tuned_file << para end @tuned_file=@tuned_file.flatten end def endnote_call_number(data) data.each do |word| unless data =~/^<:code>/ case word when /~\{/ unless word =~/~\{[*+]+/ word.gsub!(/~\{/,"~\{#{@@endnote_counter} ") @@endnote_counter+=1 end when /~\[/ if word =~/~\[[+]/ word.gsub!(/~\[[+]/,"~\[\+#{@@endnote_counter_dag} ") @@endnote_counter_dag+=1 else word.gsub!(/~\[[*]?/,"~\[\*#{@@endnote_counter_asterisk} ") @@endnote_counter_asterisk+=1 end when /~\^|<:e>/ word.gsub!(/~\^|<:e>/,"#{@@endnote_array[@@endnote_counter-1]}") @@endnote_counter+=1 end end end end def metadata(data) meta,@dc,@rc,@cvs,dctitle,add=Array.new(6){[]} dir=SiSU_Env::Info_env.new(@md.fns) base_html="#{dir.url.root}/#{@md.fnb}" ocnm=ocnd=ocnv=0 ocnm+=1 header0='<:pn>' header1="\n1~meta Document Information (metadata) <~0;0:0;m#{ocnm}>" ocnm+=1 header4="\n4~metadata Metadata <~0;m#{ocnm};m#{ocnm}>" ocnm+=1; ocnd+=1 head_no_dc="<~0;m#{ocnm};d#{ocnd}>" ocnm+=1; ocnd+=1 head_no_dc_tag="<~0;m#{ocnm};d#{ocnd}>" data.each do |para| case para when /^0~(title|creator|author|translator|translated_by|illustrator|illustrated_by|prepared_by|digitized_by|description|publisher|contributor|date\.created|date\.issued|date\.available|date\.valid|date\.modified|date|type|format|rights|identifier|source|language)/i m=$1 ocnm+=1; ocnd+=1 @dc << case para when /^0~title/ "\n#{@tr.dc_title}: #{@md.dc_title} <~0;m#{ocnm};d#{ocnd}>" when /^0~(?:creator|author)/ "\n#{@tr.creator}: #{@md.dc_creator} <~0;m#{ocnm};d#{ocnd}>" when /0~(?:translator|translated_by)/ "\n#{@tr.translator}: #{@md.translator} <~0;m#{ocnm};d#{ocnd}>" when /^0~(?:illustrator|illustrated_by)/ "\n#{@tr.illustrator}: #{@md.illustrator} <~0;m#{ocnm};d#{ocnd}>" when /^0~prepared_by/ "\n#{@tr.prepared_by}: #{@md.prepared_by} <~0;m#{ocnm};d#{ocnd}>" when /^0~digitized_by/ "\n#{@tr.digitized_by}: #{@md.digitized_by} <~0;m#{ocnm};d#{ocnd}>" when /^0~description/ "\n#{@tr.description}: #{@md.dc_description} <~0;m#{ocnm};d#{ocnd}>" when /^0~subject/ "\n#{@tr.subject}: #{@md.dc_subject} <~0;m#{ocnm};d#{ocnd}>" when /^0~abstract/ "\n#{@tr.abstract}: #{@md.dc_abstract} <~0;m#{ocnm};d#{ocnd}>" when /^0~publisher/ "\n#{@tr.publisher}: #{@md.dc_publisher} <~0;m#{ocnm};d#{ocnd}>" when /^0~contributor/ "\n#{@tr.contributor}: #{@md.dc_contributor} <~0;m#{ocnm};d#{ocnd}>" when /^0~date.created/ "\n#{@tr.date_created}: #{@md.dc_date_created} <~0;m#{ocnm};d#{ocnd}>" when /^0~date.issued/ "\n#{@tr.date_issued}: #{@md.dc_date_issued} <~0;m#{ocnm};d#{ocnd}>" when /^0~date.available/ "\n#{@tr.date_available}: #{@md.dc_date_available} <~0;m#{ocnm};d#{ocnd}>" when /^0~date.modified/ "\n#{@tr.date_modified}: #{@md.dc_date_modified} <~0;m#{ocnm};d#{ocnd}>" when /^0~date.valid/ "\n#{@tr.date_valid}: #{@md.dc_date_valid} <~0;m#{ocnm};d#{ocnd}>" when /^0~date/ "\n#{@tr.date}: #{@md.dc_date} <~0;m#{ocnm};d#{ocnd}>" when /^0~type/ "\n#{@tr.type}: #{@md.dc_type} <~0;m#{ocnm};d#{ocnd}>" when /^0~format/ "\n#{@tr.format}: #{@md.dc_format} <~0;m#{ocnm};d#{ocnd}>" when /^0~rights/ "\n#{@tr.rights}: #{@md.dc_rights} <~0;m#{ocnm};d#{ocnd}>" when /^0~identifier/ "\n#{@tr.identifier}: #{@md.dc_identifier} <~0;m#{ocnm};d#{ocnd}>" when /^0~source/ "\n#{@tr.source}: #{@md.dc_source} <~0;m#{ocnm};d#{ocnd}>" when /^0~language/ "\n#{@tr.language}: #{@md.dc_language} <~0;m#{ocnm};d#{ocnd}>" when /^0~language.original/ "\n#{@tr.language_original}: #{@md.language_original} <~0;m#{ocnm};d#{ocnd}>" when /^0~relation/ "\n#{@tr.relation}: #{@md.dc_relation} <~0;m#{ocnm};d#{ocnd}>" when /^0~coverage/ "\n#{@tr.coverage}: #{@md.dc_coverage} <~0;m#{ocnm};d#{ocnd}>" when /^0~keywords/ "\n#{@tr.keywords}: #{@md.keywords} <~0;m#{ocnm};d#{ocnd}>" when /^0~comments/ "\n#{@tr.comments}: #{@md.comments} <~0;m#{ocnm};d#{ocnd}>" when /^0~cls_loc/ "\n#{@cls_dewey}: #{@md.cls_dewey} <~0;m#{ocnm};d#{ocnd}>" when /^0~cls_dewey/ "\n#{@tr.cls_dewey}: #{@md.cls_dewey} <~0;m#{ocnm};d#{ocnd}>" when /^0~cls_gutenberg|0~cls_pg/ "\n#{@tr.cls_gutenberg}: #{@md.cls_gutenberg} <~0;m#{ocnm};d#{ocnd}>" #"\n#{@tr.cls_gutenberg}: #{@md.cls_pg} <~0;m#{ocnm};d#{ocnd}>" when /^0~cls_isbn/ "\n#{@tr.cls_isbn}: #{@md.cls_isbn} <~0;m#{ocnm};d#{ocnd}>" when /^0~prefix(?:_a)?/ "\n#{@tr.prefix_a}: #{@md.prefix_a} <~0;m#{ocnm};d#{ocnd}>" when /^0~prefix_b/ "\n#{@tr.prefix_b}: #{@md.prefix_b} <~0;m#{ocnm};d#{ocnd}>" else para.gsub(/^0~(#{m})\s+(.+)/m,"\n#{m.capitalize}: \\2 <~0;m#{ocnm};d#{ocnd}>") end end end ocnm+=1; ocnv+=1 head_no_rc="<~0;m#{ocnm};v#{ocnv}>" ocnm+=1; ocnv+=1 head_no_rc_tag="<~0;m#{ocnm};v#{ocnv}>" data.each do |para| case para when /^0~(?:cvs|rcs)\+\s+/ #note the + sign to turn on use of cvs id ocnm+=1; ocnv+=1 @cvs << "#{@tr.sc_number}: #{@md.sc_number} <~0;m#{ocnm};v#{ocnv}>" ocnm+=1; ocnv+=1 @cvs << "#{@tr.sc_date}: #{@md.sc_date} <~0;m#{ocnm};v#{ocnv}>" ocnm+=1; ocnv+=1 @cvs << "CVS/RCS time: #{@md.sc_time} <~0;m#{ocnm};v#{ocnv}>" ocnm+=1; ocnv+=1 when /^0~cvs[+\s]/ #enable pattern above instead if you wish the default to be to include cvs tags from all documents KEEP when /^0~cvs\s+/ #enable pattern above instead if you wish the default to be to include cvs tags from all documents KEEP end end if true #default version information ocnm+=1; ocnv+=1 if @md.sc_filename \ and @md.sc_filename.length > 3 @rc << "#{@tr.sourcefile}: #{@md.sc_filename} <~0;m#{ocnm};v#{ocnv}>" else @rc << "#{@tr.sourcefile}: #{@md.fns} <~0;m#{ocnm};v#{ocnv}>" end ocnm+=1; ocnv+=1 if @md.file_encoding \ and @md.file_encoding.length > 3 #translate @rc << "Filetype: #{@md.file_encoding} <~0;m#{ocnm};v#{ocnv}>" end ocnm+=1; ocnv+=1 if @md.dgst #change. enable by default @rc << "#{@tr.sourcefile_digest}, #{@md.dgst[0]} #{@md.dgst[1]} <~0;m#{ocnm};v#{ocnv}>" ocnm+=1; ocnv+=1 end if @md.dgst_skin #change. enable by default @rc << "Skin_Digest: #{@md.dgst_skin[0]} #{@md.dgst_skin[1]} <~0;m#{ocnm};v#{ocnv}>" ocnm+=1; ocnv+=1 end @rc << "Generated #{head_no_rc}" if @rc.length > 0 @rc << "#{@tr.last_generated}: #{Time.now} <~0;m#{ocnm};v#{ocnv}>" ocnm+=1; ocnv+=1 if @md.sisu_version[:version] @rc << "#{@tr.sisu_version}: #{@md.sisu_version[:project]} #{@md.sisu_version[:version]} of #{@md.sisu_version[:date_stamp]} (#{@md.sisu_version[:date]}) <~0;m#{ocnm};v#{ocnv}>" ocnm+=1; ocnv+=1 end @rc << "#{@tr.ruby_version}: #{@md.ruby_version} <~0;m#{ocnm};v#{ocnv}>" end meta << header0 meta << header1 meta << header4 meta << "Document Manifest @\n #{base_html}/#{@md.fn[:manifest]} <~0;m#{ocnm};m#{ocnm}>" meta << "Dublin Core (DC) #{head_no_dc}" if @dc.length > 0 meta << "DC tags included with this document are provided here. #{head_no_dc_tag}" if @dc.length > 0 @dc.each { |x| meta << x } meta << "Version Information #{head_no_rc}" if @rc.length > 0 if @cvs.length > 0 meta << "Note the version information provided here, is specific to the host site. #{head_no_rc_tag}" @cvs.each { |x| meta << x } end @rc.each { |x| meta << x } ## ENDNOTE RELATED endnote related meta << "\n" meta=object_digest(meta) end def stamped(para,hash_class) @tuned=[] para=strip_clean_extra_spaces(para) digest_all=hash_class.hexdigest(para) # print "#{hash_class.name}: "; puts digest_all #length==32 or 64 stripped=strip_clean_of_markup(para) digest_strip=hash_class.hexdigest(stripped) unless para =~/<:code>/ case para when /~\{[\d*+]+\s+.+?\}~|~\[[*+]\d+\s+.+?\]~/m en_and_para,en_and_para_digest=[],[] para.gsub!(/\s*(\}~|\]~)/m,' \1') #watch para_plus_en=para.scan(/.*?~\{.+?\}~|.*?~\[.+?\]~/m) para_tail=if para =~/(?:.*?~\{.+?\}~|.*?~\[.+?\]~)+([\s\S]+)/m /(?:.*?~\{.+?\}~|.*?~\[.+?\]~)+(.+?<~\d+;(?:\w|[0-6]:)\d+;\w\d+>)/m.match(para)[1] else '' end para_plus_en << para_tail en_and_para_digest << endnote_digest(para_plus_en) para_new=en_and_para_digest.join(' ') @tuned << para_new + '<' + digest_strip + ':' + digest_all + '>' unless para.nil? else @tuned << para + '<' + digest_strip + ':' + digest_all + '>' unless para.nil? end else @tuned << para + '<' + digest_strip + ':' + digest_all + '>' unless para.nil? end @tuned.join end def object_digest(data) # 1. clean/stripped text without any markup, paragraph, headings etc. without endnotes # 2. endnotes clean/stripped text digest only (there may be several endnotes within a paragraph) # 3. whole object, text with markup and any endnotes, (question: with or without the endnote digests??? presumption better without, [however may be easier to check with?]) # [digests should not include other digests] # vim==/<[0-9a-f]\{#{@@dl}\}\(:[0-9a-f]\{#{@@dl}\}\)\?>/ require 'digest/md5' require 'digest/sha2' @tuned_file=[] data.compact! data.each do |para| para.strip! if para=~/<~\d+;(?:\w|[0-6]:)\d+;\w\d+>/ if @env.digest.type =~/sha256/ for hash_class in [ Digest::SHA256 ] @tuned_file << stamped(para,hash_class) end else for hash_class in [ Digest::MD5 ] @tuned_file << stamped(para,hash_class) end end else @tuned_file << para unless para.nil? end end @tuned_file=@tuned_file.flatten #use md5 or to create hash of each dal object including ocn, & add into to each dal object end def endnote_digest(data) para_bit=[] data.each do |en_plus| para_bit <<= case en_plus when /~\{|~\[/ if en_plus =~/~\{.+?\}~|~\[.+?\]~/ para_txt,en_open,en_txt,en_close=/(.*?)(~\{|~\[)(.+?)(\}~|\]~)/m.match(en_plus)[1..4] stripped_en=strip_clean_of_markup(en_txt) if @env.digest.type =~/sha256/ digest_en_strip=Digest::SHA256.hexdigest(stripped_en) else digest_en_strip=Digest::MD5.hexdigest(stripped_en) end para_txt + en_open + en_txt + '<' + digest_en_strip + '>' + en_close else puts "Error Exception - problem encountered with:\n#{en_plus}" #arbitrary exception, tidy up end else en_plus end end para_bit.join end def strip_clean_extra_spaces(s) # dal output tuned s=s.dup s=s.gsub(/[ ]+([,.;:?](?:$|\s))/,'\1') s=s.gsub(/ [ ]+/,' ') s=s.gsub(/^ [ ]+/,'') s=s.gsub(/ [ ]+$/,'') s=s.gsub(/(<\/[bi]>')[ ]+(s )/,'\1\2') end def strip_clean_of_markup(s) # used for digest, define rules, make same as in db clean #consider: <\/?[ib]>|<(?:\/ )?br>|(.+?)<\/del> s=s.dup s=s.gsub(/(?:<\/?[ib]>|<~\d+;(?:\w|[0-6]:)\d+;\w\d+>|<#@dp:#@dp>|^[1-6]~\S+|~\{\d+\s.+?\}~)/,'') # markup and endnotes removed #% same as db clean --> s=s.gsub(/(.+?)<\/del>/,'DELETED(\1)') # deletions s=s.gsub(/(\d+)<\/sup>/,'[\1]') s=s.gsub(/(?: \\;)+/,' ') #s=s.gsub(//,"[TABLE]\n") # tables #s=s.gsub(//,'\1') # tables #s=s.gsub(/¡¡\d+¡/,' ') # tables #s=s.gsub(/¡/,' ') # tables tidy later #s=s.gsub(/<.+?>/,'') s=s.gsub(/\{.+?\.(?:png|jpg|gif).+?\}(?:https?|file|ftp)\\\:\S+ /,' [image] ') # else image names found in search s=s.gsub(/\s\s+/,' ') s=s.strip end end end __END__ dal output, rules to simplify parsing nodes === objects === paragraphs === text blocks separated by \n\n dal output: :verse :group and :code have -end :table is not used