#!/usr/bin/env ruby # = sisu - SiSU information Structuring Universe # # Copyright (c) Ralph Amissah 1997,2004 # # Ralph Amissah mailto:ralph@amissah.com # # * Name: SiSU information Structuring Universe # * Author: Ralph@Amissah.com # * Description: document conversion tool, to sisu from other formats # * arch-tag: document conversion tool to sisu markup # * $Date: 2004/10/16 15:51:06 $ # * $Id: sisu_convert,v 1.37 2004/10/16 15:51:06 ralph Exp $ # * License: GPL 2 or later # * Notes: word conversion uses wvWare and wvSiSU.xml (a modified/stripped wvHtml.xml) # * http://wvware.sourceforge.net/ # * http://sourceforge.net/projects/wvware # * |sisu.lnk|@|^| # * # * |zxy_param.rb|@|^| module CONVERT class MyOutput def initialize(data, filename, instruct) @data=data.compact @filename=filename @instruct=instruct end def headerBasic <\n" #: <<#{@@html_title}>> data=WareWord97.new(data.collect, @filename, @instruct).strip data=WareWord97.new(data.collect, @filename, @instruct).strip data=WareWord97.new(data.collect, @filename, @instruct).markup_rules data=MyOutput.new(data.collect, @filename, @instruct).hardOutput end def strip data=@data tuned_file=Array.new endnote_no=1 data.each do |para| para.strip! para.gsub!(/\s*<\/u>/, '') para.gsub!(/<\/u>\s*/, '') para.gsub!(/\s*<\/b>/, '') para.gsub!(/<\/b>\s*/, '') para.gsub!(/\s*<\/i>/, '') para.gsub!(/<\/i>\s*/, '') tuned_file << para unless para == nil end tuned_file end def markup_rules data=@data tuned_file=Array.new endnote_no=1 data.each do |para| para.strip! 
para.gsub!(/\s+/, ' ') para.gsub!(/^(Chapter|Article)(.+?)<\/b>/i, "4{ \\1 \\2") #watch case insensitivity para.gsub!(/^(Part|Section|Book)(.+?)<\/b>/i, "3{ \\1 \\2") #watch case insensitivity para.gsub!(/^(\d+\.\d+\.\d+)(.+?)<\/b>/i, "6{ \\1 \\2") #numeric, decide what to do, can be different para.gsub!(/^(\d+\.\d+)(.+?)<\/b>/i, "5{ \\1 \\2") #numeric, decide what to do, can be different para.gsub!(/^(\d+)(.+?)<\/b>/i, "4{ \\1 \\2") #numeric, decide what to do, can be different para.gsub!(/(.+?)<\/u>/, "_{\\1}_") para.gsub!(/(.+?)<\/b>/, "*{\\1}*") para.gsub!(/(.+?)<\/i>/, "/{\\1}/") tuned_file << para unless para == nil end tuned_file end end class Html def initialize(data, filename, instruct) @data=data @filename=filename @instruct=instruct end def songsheet data=@data print "Convert to SiSU file from #{@filename}.html << gvim ,,#{@filename}.er9 >\n" #: <<#{@@html_title}>> #data=Html.new(data.collect, @filename, @instruct).space_paragraphs #data=Html.new(data.split(''), @filename, @instruct).space_paragraphs data=Html.new(data.split("\n"), @filename, @instruct).space_paragraphs #data=Html.new(data.collect.join.split("\n"), @filename, @instruct).space_paragraphs data=Html.new(data.collect, @filename, @instruct).multiline data=Html.new(data.collect.join.split("\n\n"), @filename, @instruct).markup_rules data=MyOutput.new(data.collect, @filename, @instruct).hardOutput end def space_paragraphs #data=@data.join.split(/\n/) data=@data #p data.length tuned_file=Array.new data.each do |para| para.strip! para.gsub!(/\r/, '') #para.gsub!(/\n/, ' ') #PROBLEM, serious time issues on a few files also for \n (or multiline matches which is less surprising), edit out if necessary para.gsub!(/<\/?p>/i, 'zZz') para.gsub!(/<\/?\s*p(?:\s+ALIGN=.+?)?>/i, "zZz") #all manner of

para.gsub!(/<\/?p>/i, "\n\n") para.gsub!(//i, "zZz") # para.gsub!(/<\/p>/i, "zZz") # repeat actually para.gsub!(/<(?:dir|tr|br)>/i, "zZz") # #para.gsub!(/<(?:\/\s*)?(?:dir|tr|br)>/i, "zZz") # para.gsub!(/(<\/center>)/i, "\\1zZz") para.gsub!(/(<\/h[1-6]>)/i, "\\1zZz") para.gsub!(/ \s+/i, ' ') para.gsub!(/(?:\s*zZz\s*)+/i, "zZz") # tuned_file << para unless para == nil end tuned_file end def blockquotes(sub='') # SERIOUS PROBLEM INTRODUCED, some blockquotes go missing !, quite unacceptable, debug, for now not used res=Array.new sub.each do |x| if x=~/(<\/blockquote>)/i m = $1 res << x[/(.+?)#{m}/mi, 1].gsub!(/zZz/,"zZz_1 ") if x =~/.+?#{m}/mi res << x[/#{m}(.+)/mi, 1] else res << x #[/(.+)/mi, 1] end end res.join end def multiline data=@data tuned_file=Array.new data.each do |para| para.gsub!(/\n/, ' ') para.gsub!(/ \s+/mi, ' ') #ALL HERE could be very time EXPENSIVE but tamed? compromise ... /mi para.gsub!(/<([biu]|h[1-6])>(?:zZz)?([^<]+)?zZz(.+?)<\/\1>/i, "zZz<\\1>\\2 \\3") para.gsub!(/<([biu]|h[1-6])>(?:

|zZz)+(.+?)(?:<\/center>)?zZz(.+?)?<\/\1>/i, "zZz<\\1>\\2 \\3") #para.gsub!(/<([biu]|h[1-6])>(?:
|zZz)+(.+?)<\/center>zZz(.+?)?<\/\1>/i, "zZz<\\1>\\2 \\3") para.gsub!(/<([biu]|h[1-6])>(?:
|zZz)+(.+?)<\/\1>/i, "zZz<\\1>\\2") para.gsub!(/<(h[1-6])>(.+?)(?:
|zZz)+<\/\1>/i, "zZz<\\1>\\2zZz") #does catch some h1, h2 etc, too expensive to have biu #para.gsub!(/<([biu]|h[1-6])>(.+?)(?:
|zZz)+<\/\1>/i, "zZz<\\1>\\2 \\3") #may go too far? useful for h1 h2 etc, remove biu? #para.gsub!(/<([biu]|h[1-6])>([^<]+)?zZz(.+?)<\/\1>/i, "zZz<\\1>\\2 \\3") #para.gsub!(/<([biu]|h[1-6])>([^<]+)?zZz(.+?)<\/\1>/i, "zZz<\\1>\\2 \\3") ### SERIOUS PROBLEM INTRODUCED # sub = para.split(/
/i) # para = blockquotes(sub) if sub.length > 0 #check was on >1 could have serious repercussions 2004w29 para.gsub!(/zZz(\s*zZz)*/, "\n\n") tuned_file << para << "\n\n" unless para == nil end tuned_file end def markup_rules data=@data tuned_file=Array.new data.each do |para| if para=~//i #p para.grep(//i) #m=$1 #para.gsub!(/(?:<\s*)?#{m}<\/a>(?:\s*>)?\.?/i, "#{m}") para.gsub!(/(?:<\s*)?http:\/\/.+?<\/a>(?:\s*>)?\.?/i, "\\1") #risk that url & url are not to match #para.gsub!(/(?:<\s*)?(http:\/\/.+?\/\1)<\/a>(?:\s*>)?\.?/i, "\\2") #does not match end ### clean para.gsub!(/^\s+/i, '') para.gsub!(/<([bui]|em|su[pb])>\s*<\/\1>/i, '') para.gsub!(/<\/?center>/i, '') para.gsub!(/\s*<\/dir>/i, '') para.gsub!(/
/i, '') para.gsub!(/\s*
\[(\*+)\]<\/a>/i, "^{[\\1]}^ ") #other endnote marker para.gsub!(/\[[a-z]?\d+\](?:<\/[bi]>)?<\/a>/i, '~e ') #endnote marker para.gsub!(/\[[a-z]?\d+\](?:<\/[bi]>)?<\/a>/i, '~e ') #endnote marker para.gsub!(/\s*(<\/a>)?\s*\d+\.?\s*(<\/a>)?\s*/i, '~{{ ') #endnote #para.gsub!(/\s*(.+?)\s*<\/h\1>\s*/i, "\\1{ \\2") # para.gsub!(/\s*(.+?)\s*<\/h\1>\s*/i, "\\1{ \\2") # para.gsub!(/^(Chapter|Article)(.+?)<\/b>/i, "4{ \\1 \\2") #watch case insensitivity para.gsub!(/^(Part|Section|Book)(.+?)<\/b>/i, "3{ \\1 \\2") #watch case insensitivity para.gsub!(/^(\d+\.\d+\.\d+)(.+?)<\/b>/i, "6{ \\1 \\2") #numeric, decide what to do, can be different para.gsub!(/^(\d+\.\d+)(.+?)<\/b>/i, "5{ \\1 \\2") #numeric, decide what to do, can be different para.gsub!(/^(\d+)(.+?)<\/b>/i, "4{ \\1 \\2") #numeric, decide what to do, can be different # para.gsub!(/^()(?:)?<(?:b|strong)>\s*(.+?)\s*<\/(?:b|strong)>/i, "5{ \\2 \\1") #watch para.gsub!(/^(<(a name|A NAME)=".+?">)(\s*|<\/[aA]>)?([A-Z][A-Z])+/, "5{ \\2 \\1") #watch para.gsub!(/^(\s+|

)?()(\s*|<\/a>)?/i, "5{ \\2 \\1") #watch para.gsub!(/\s*(.+?)\s*<\/h\1>\s*/i, "\\1{ \\2") # para.gsub!(/^\s*(.+?)<\/b>\s*(<\/i>\s*)?$/i, "4{ \\1\\2") # wish it all were less messy para.gsub!(/^\s*([^"(].+?)<\/i>\s*(<\/b>\s*)?$/i, "5{ \\1\\2") # wish it all were less messy para.gsub!(/<\/?[biu]>/i, '') if para =~/[1-6]\{/ para.gsub!(/\s*(.+?)\s*<\/u>/i, "_{\\1}_") para.gsub!(/<(b|strong)>\s*(.+?)\s*<\/\1>/i, "*{\\2}*") para.gsub!(/<(i|em)>\s*(.+?)\s*<\/\1>/i, "/{\\2}/") para.gsub!(/\s*(.+?)\s*<\/sup>/i, "^{\\1}^") para.gsub!(/(([\/\*!_])\{.+?\}\2)\s\s+/i, "\\1 ") para.gsub!(/(([\/\*!_])\{.+?\}\2)\s+([.,;?\)])\s+/i, "\\1\\3 ") para.gsub!(/(([\/\*!_])\{.+?\}\2)(["'])\s+/i, "\\1\\3 ") para.gsub!(/(([\/\*!_])\{.+?\}\2)\s*([a-z0-9])/i, "\\1 \\3") para.gsub!(/(([\/\*_])\{.+?\}\2)\s*([a-z0-9])/i, "\\1 \\3") para.gsub!(/([a-z0-9])(([\/\*_])\{.+?\}\3)/i, " \\1 \\2") #eg this/{problem}/ para.gsub!(/([\/\*_])\{([,.;; ]+)\}\1/i, "\\2") #eg /{,}/ or *{ }* etc. para.gsub!(/ \s+/i, ' ') #para.gsub!(/\/\{\*\{/i, '*{/{') #para.gsub!(/\}\*\}\//i, '}/}*') para.gsub!(/"/i, '"') para.gsub!(/&/i, 'and') para.gsub!(//i, '') para.gsub!(/<\/(?:title)>/i, '') para.gsub!(//i, '#{~title? ') para.gsub!(/<blockquote>(.+?)<\/blockquote>/mi, "\n\n_1 \\1\n\n") para.gsub!(/<div align=.+?>|<\/div>|<font size=.+?>|<\/a><\/em><\/strong>/i, '') para.gsub!(/~e\s+\.\s*/i, ".~e ") #check vim equiv # %s/\~e\s\+\.\s*/.\~e /c para.gsub!(/\s+~e\s+/i, "~e ") para.gsub!(/ \s+/i, ' ') para.gsub!(/\s+$/i, '') para.gsub!(/^(?:<\/[bi]>)+$/i, '') para.gsub!(/^(?:(?:<i>)+<b>|(?:<b>)+<i>)\s*([^"(].+?)/i, "5{ \\1\\2") # wish it all were less messy para.gsub!(/^(?:<\/?(?:[ib]|em)>\s*)+$/i, '') # cleaning up left over <i> etc. 
para.gsub!(/<(?:i|em)>\s*(.+)/i, "/{\\1}/") # using up left over <i> para.gsub!(/<b>\s*(.+)/i, "*{\\1}*") # using up left over <b> #para.gsub!(/^(?:<(?:\/)?[bi]>)+$/i, '') tuned_file << para unless para == nil end tuned_file end end class Default < Html def initialize(data, filename, instruct) @data=data @filename=filename @instruct=instruct end def songsheet data=@data print "Convert to SiSU file from #{@filename}.html << gvim ,,#{@filename}.er9 >\n" #: <<#{@@html_title}>> data=Default.new(data.collect, @filename, @instruct).space_paragraphs data=Default.new(data.collect, @filename, @instruct).multiline data=Default.new(data.collect.join.split("\n\n"), @filename, @instruct).markup_rules data=Default.new(data.collect, @filename, @instruct).markup_default data=MyOutput.new(data.collect, @filename, @instruct).hardOutput end def markup_default data=@data tuned_file=Array.new data.each do |para| para.gsub!(/<i>(Id\.?)(\s|$)/i, "/\{\\1\}\\2/") para.gsub!(/^(~\{\{ .+?)(<\/LI>\s*|<\/OL>\s*)+$/i, "\\1") para.gsub!(/\/\{Id\.\s*<\/LI>\s*\}\//i, '/{Id.}/') tuned_file << para unless para == nil end tuned_file end end end
# NOTE(review): the single line above is carried over unchanged. It is the
# closing part of `module CONVERT`, which begins earlier in the file and whose
# line structure has been lost; it is deliberately not reformatted here.

# Prints the command-line usage summary to standard output.
def help
  puts <<WOK
conversion program initial SiSU markup from other file formats

  zxy_convert --word does initial conversion from word97 to sisu markup, expects [filename].doc (can also use --doc)
  zxy_convert --html does initial conversion from html to sisu markup, expects [filename].html
  zxy_convert --default does initial conversion from default html to sisu markup, expects [filename].html
WOK
end

# Appends to @argv the base name (extension removed) of every argument that
# ends in ".<ext>"; any other argument is reported on standard output and
# ignored.
#
# argv - Array of file name Strings taken from the command line.
# ext  - extension to look for, without the leading dot (e.g. 'doc').
#
# Returns @argv.
def collect_basenames(argv, ext)
  pattern = /(.+?)\.#{Regexp.escape(ext)}$/
  argv.each do |f|
    base = f[pattern, 1]
    if base
      @argv << base
    else
      print "not .#{ext}? << #{f} >> "
    end
  end
  @argv
end

# Converts each [filename].doc argument: runs wvWare with the wvSiSU.xml
# configuration found under @dir.home, writes its output to [filename].wv,
# then passes that file's contents to CONVERT::WareWord97#songsheet.
#
# argv     - Array of file name Strings.
# instruct - the command-line switch, handed on to the converter class.
def doWord(argv, instruct)
  collect_basenames(argv, 'doc')
  @argv.each do |filename|
    config = "#{@dir.home}/.sisu/convert/wvSiSU.xml"
    # Argument-vector form: no shell is involved, so spaces or shell
    # metacharacters in a file name cannot alter the command. The :out
    # option replaces the former "> file" shell redirection.
    converted = system('wvWare', '-x', config, "#{filename}.doc", :out => "#{filename}.wv")
    unless converted
      warn "wvWare failed for #{filename}.doc, skipping"
      next
    end
    # An empty separator makes readlines split on blank lines (paragraphs).
    file_array = File.readlines("#{filename}.wv", '')
    CONVERT::WareWord97.new(file_array, filename, instruct).songsheet # metaverse created here
  end
end

# Converts each [filename].html argument by passing the file's contents to
# CONVERT::Html#songsheet.
#
# argv     - Array of file name Strings.
# instruct - the command-line switch, handed on to the converter class.
def doHtml(argv, instruct)
  collect_basenames(argv, 'html')
  @argv.each do |filename|
    # File.readlines (rather than IO.readlines) never treats a leading "|"
    # in the path as a command to run.
    # NOTE(review): the "\n\r" record separator is kept as found; it may have
    # been meant as "\r\n" -- confirm before changing.
    file_array = File.readlines("#{filename}.html", "\n\r")
    CONVERT::Html.new(file_array, filename, instruct).songsheet # metaverse created here
  end
end

# Same as doHtml, but uses CONVERT::Default, which applies one further markup
# pass (markup_default) after the Html rules.
#
# argv     - Array of file name Strings.
# instruct - the command-line switch, handed on to the converter class.
def doDefault(argv, instruct)
  collect_basenames(argv, 'html')
  @argv.each do |filename|
    file_array = File.readlines("#{filename}.html", "\n\r")
    CONVERT::Default.new(file_array, filename, instruct).songsheet # metaverse created here
  end
end

# Dispatches on the command-line switch; anything unrecognised prints help.
#
# argv     - Array of remaining command-line arguments (file names).
# instruct - the switch String, e.g. "--word", "--doc", "--html", "--default".
def cases(argv, instruct)
  case instruct
  when /^--(word(97)?|doc)$/i # Word 97 input via wvWare
    doWord(argv, instruct)
  when /^--(html)$/i # HTML input, Html rules
    doHtml(argv, instruct)
  when /^--(default)$/i # HTML input, Html rules plus markup_default
    doDefault(argv, instruct)
  else
    help
  end
end

require 'zxy_sysenv.rb'
include SiSU_Env
@dir = SiSU_Env::Info_dir.new
@argv = []
argv = ARGV
# The first argument is the switch; a missing or empty one falls back to help.
instruct = argv.shift.to_s.chomp
instruct = 'help' if instruct.empty?
cases(argv, instruct)