From fdd1489c82a274615e46e3c67fc5707e3fb0465f Mon Sep 17 00:00:00 2001 From: Ralph Amissah Date: Mon, 9 Jul 2007 11:19:48 +0100 Subject: improved url matching, and texpdf tolerance and indentation levels set --- CHANGELOG | 20 ++++++++++++++++++ lib/sisu/v0/dal_syntax.rb | 8 +++++-- lib/sisu/v0/html_tune.rb | 18 ++++------------ lib/sisu/v0/odf.rb | 12 +++++++++-- lib/sisu/v0/plaintext.rb | 3 ++- lib/sisu/v0/shared_html_lite.rb | 17 +++------------ lib/sisu/v0/shared_xml.rb | 11 ++++++---- lib/sisu/v0/texinfo_format.rb | 7 ++++--- lib/sisu/v0/texpdf.rb | 1 + lib/sisu/v0/texpdf_format.rb | 46 ++++++++++++++++++++--------------------- 10 files changed, 79 insertions(+), 64 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index a0419694..ae4d6eec 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -6,6 +6,26 @@ Reverse Chronological: %% STABLE MANIFEST +%% sisu_0.55.3.orig.tar.gz (2007-07-07:27/6) +http://www.jus.uio.no/sisu/pkg/src/sisu_0.55.3.orig.tar.gz + sisu_0.55.3.orig.tar.gz + sisu_0.55.3-1.dsc + sisu_0.55.3-1.diff.gz + + * url + * matching improved + (plaintext, html, xml, odf, tex/pdf, db) + * possibility to escape decoration + with \http://url or _http://url becomes http://url + + * texpdf + * TeX line breaking tolerance altered, to deal with urls + better + * indentation levels reset + + FIX [* texpdf, indent levels, level 1 goes in to far, level 2 not + differentiated!] FIX + %% sisu_0.55.2.orig.tar.gz (2007-07-07:27/6) http://www.jus.uio.no/sisu/pkg/src/sisu_0.55.2.orig.tar.gz 510b2648ada09f60241f0a6f7cb8d180 1265392 sisu_0.55.2.orig.tar.gz diff --git a/lib/sisu/v0/dal_syntax.rb b/lib/sisu/v0/dal_syntax.rb index 42e911e5..a882a2c9 100644 --- a/lib/sisu/v0/dal_syntax.rb +++ b/lib/sisu/v0/dal_syntax.rb @@ -191,7 +191,7 @@ module Syntax # # #numbered (list) level 1 # _# #numbered (list) level 2 line=line.dup - unless line =~/^0~|<:codeline>|<:code-end>/ + if line !~/^0~|<:codeline>|<:code-end>/ #special characters: ~ { } < > - _ / also used : ^ ! # line_array=[] line.gsub!(/^%{1,4} .+/mi,'') #remove comments @@ -282,7 +282,9 @@ module Syntax if line =~/(<:(?:verse|group)>)/; line.gsub!(/(<:(?:verse|group)>)/i,"\\1\n") #cosmetic else line.gsub!(/(
)/i,"\\1\n") end - else + else #code blocks + line.gsub!(/(^|\s)(http:\/\/\S+)/,'\1_\2') #line.gsub!(/(^|\s)(http:\/\/\S+)/,"\\1\\\\\\2") #escape urls + line.gsub!(/(^|\s)<(http:\/\/\S+)>([\s,.]|$)/,'\1\2\3') #clean/unescape urls with decoration, re-apply decoration later line.gsub!(/<:codeline>/,"\n") end line @@ -337,3 +339,5 @@ module Syntax end end __END__ +NOTE: +downstream code blocks are not currently/yet honoured, e.g. stuff within angle brackets are removed diff --git a/lib/sisu/v0/html_tune.rb b/lib/sisu/v0/html_tune.rb index 1fb282de..1d3461c3 100644 --- a/lib/sisu/v0/html_tune.rb +++ b/lib/sisu/v0/html_tune.rb @@ -196,6 +196,7 @@ module SiSU_Tune @sys=SiSU_Env::System_call.new @dp=@@dp ||=SiSU_Env::Info_env.new.digest.pattern @env=SiSU_Env::Info_env.new(@md.fns) + @url_brace=SiSU_Viz::Skin.new.url_decoration #@utf8=SiSU_character_encode::UTF8 #.new end def songsheet @@ -324,20 +325,9 @@ module SiSU_Tune if (para =~/\b\S+\@\S+?\.\S+/ and para !~/(\"\S+\@\S+?\.\S+\"|>\S+\@\S+?\.\S+?<)/) para.gsub!(/\b(\S+\@\S+?\.\S+)(\s)/,'<\1>\2') end - if (para !~/(\"\w+:\/\/\S+?\"|>\s*\w+:\/\/\w+?\S*<)/) #url markup http etc. - if para=~/\w+:\/\/\S+?\.\S+?[.,] / - para.gsub!(/(\w+:\/\/\S+?\.\S+?)([.,] )/,'<\1>\2') #full stops ! have been a bother - else - para.gsub!(/(\w+:\/\/\S+?\.\S+)/, '<\1>') - end - end - if (para =~/[ ^](?:https?|ftp):\/\/\S+/) #url markup http leftovers watch carefully may accept too much 2004w46 - if para=~/([ ^])((?:https?|ftp):\/\/\S+?)([\.,] )/ - para.gsub!(/([ ^])((?:https?|ftp):\/\/\S+?)([\.,] )/,'\1<\2>\3') #full stops ! have been a bother - else - para.gsub!(/([ ^])((?:https?|ftp):\/\/\S+)/,'\1<\2>') - end - end + para.gsub!(/(^|\s)[_\\]((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,'\1\2\3') #http ftp matches escaped, no decoration + para.gsub!(/((?:^|\s)[}])((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,'\1\2\3') #special case \{ e.g. \}http://url + para.gsub!(/(^|\s)((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,%{\\1#{@url_brace.xml_open}\\2#{@url_brace.xml_close}\\3}) #http ftp matches with decoration if (para =~/..\/\S+/ and para !~/(\"..\/\S+?\"|>\s*..\/\S+<)/) para.gsub!(/(\.\.\/\S+)/,'\1') end diff --git a/lib/sisu/v0/odf.rb b/lib/sisu/v0/odf.rb index a5ed30a4..cff57888 100644 --- a/lib/sisu/v0/odf.rb +++ b/lib/sisu/v0/odf.rb @@ -295,8 +295,14 @@ module SiSU_ODF end def normal(para) #P1 - P3 para.gsub!(@serial,'') - para.gsub!(/(^|\s)(https?:\/\/[^'">< ]+?)([\.,]?(?: |$))/,%{\\1#{@url_brace.xml_open}\\2#{@url_brace.xml_close}\\3}) - para.gsub!(/([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+)/,%{#{@url_brace.xml_open}\\1#{@url_brace.xml_close}}) + para.gsub!(/(^|\s)[_\\]((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/, + %{\\1\\2\\3}) #http ftp matches escaped, no decoration + para.gsub!(/((?:^|\s)[}])((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/, + %{\\1\\2\\3}) #special case \{ e.g. \}http://url + para.gsub!(/(^|\s)((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/, + %{\\1#{@url_brace.xml_open}\\2#{@url_brace.xml_close}\\3}) #http ftp matches with decoration + para.gsub!(/([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+)/, + %{#{@url_brace.xml_open}\\1#{@url_brace.xml_close}}) para=case para when /^<:i([1-9])>\s/m m=$1 @@ -369,6 +375,8 @@ module SiSU_ODF parray=[] para.split(/<:?br(?: \/)?>/).each do |parablock| parablock=group_clean(parablock) + parablock.gsub!(/(^|\s)[_\\]((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/, + %{\\1\\2\\3}) #http ftp matches escaped, no decoration parray << %{#{parablock}} if parablock =~/\S+/ end para=parray.join + '' diff --git a/lib/sisu/v0/plaintext.rb b/lib/sisu/v0/plaintext.rb index 4bff976a..f08a0871 100644 --- a/lib/sisu/v0/plaintext.rb +++ b/lib/sisu/v0/plaintext.rb @@ -327,7 +327,8 @@ WOK end para.gsub!(/<:p[bn]>/,'') # remove page breaks para.gsub!(/^\s*<~\d+;(?:\w|[0-6]:)\d+;\w\d+><#@dp:#@dp>$/,'') # remove empty lines - check - para.gsub!(/(^|\s)(https?:\/\/[^"><]+?)([,.:;"><]?(?:\s|$))/,"\\1#{@url_brace.txt_open}\\2#{@url_brace.txt_close}\\3") + para.gsub!(/(^|\s)((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,"\\1#{@url_brace.txt_open}\\2#{@url_brace.txt_close}\\3") + para.gsub!(/(^|\s)[_\\]((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,'\1\2\3') para.gsub!(/(.+?)<\/a>/m,'\1') para.gsub!(/<:name#\S+?>/,'') # remove name links para.gsub!(/ /,' ') # decide on diff --git a/lib/sisu/v0/shared_html_lite.rb b/lib/sisu/v0/shared_html_lite.rb index ed622fb8..5285b6db 100644 --- a/lib/sisu/v0/shared_html_lite.rb +++ b/lib/sisu/v0/shared_html_lite.rb @@ -120,20 +120,9 @@ module SiSU_Format_Shared words=word_mode.join(' ') para.gsub!(/.+/,words) end - if (para !~/(\"\w+:\/\/\S+?\"|>\s*\w+:\/\/\w+?\S*<)/) #url markup http etc. - if para=~/\w+:\/\/\S+?\.\S+?[.,] / - para.gsub!(/(\w+:\/\/\S+?\.\S+?)([.,] )/,%{#{@url_brace.xml_open}\\1#{@url_brace.xml_close}\\2}) #full stops ! have been a bother - else - para.gsub!(/(\w+:\/\/\S+?\.\S+)/,%{#{@url_brace.xml_open}\\1#{@url_brace.xml_close}}) - end - end - if (para =~/[ ^](?:https?|ftp):\/\/\S+/) #url markup http leftovers watch carefully may accept too much 2004w46 - if para=~/([ ^])((?:https?|ftp):\/\/\S+?)([\.,] )/ - para.gsub!(/([ ^])((?:https?|ftp):\/\/\S+?)([\.,] )/,%{\\1#{@url_brace.xml_open}\\2#{@url_brace.xml_close}\\3}) #full stops ! have been a bother - else - para.gsub!(/([ ^])((?:https?|ftp):\/\/\S+)/,%{\\1#{@url_brace.xml_open}\\2#{@url_brace.xml_close}}) - end - end + para.gsub!(/(^|\s)[_\\]((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,'\1\2\3') #http ftp matches escaped, no decoration + para.gsub!(/((?:^|\s)[}])((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,'\1\2\3') #special case \{ e.g. \}http://url + para.gsub!(/(^|\s)((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,%{\\1#{@url_brace.xml_open}\\2#{@url_brace.xml_close}\\3}) #http ftp matches with decoration para end def paragraph diff --git a/lib/sisu/v0/shared_xml.rb b/lib/sisu/v0/shared_xml.rb index bcdf44f5..995044db 100644 --- a/lib/sisu/v0/shared_xml.rb +++ b/lib/sisu/v0/shared_xml.rb @@ -347,8 +347,7 @@ module SiSU_XML_munge def markup(para='') wordlist=para.scan(/\S+|\n/) #\n needed for tables, check though added 2005w17 para=tidywords(wordlist).join(' ').strip - para.gsub!(/(^|\s+)<\s+/,'\1< ') - para.gsub!(/\s+>(\s+|$)/,' >\1') + para.gsub!(/(^|\s+)<\s+/,'\1< '); para.gsub!(/\s+>(\s+|$)/,' >\1') para.gsub!(/<:pb>\s*/,'') para.gsub!(/<+[-~]#>+/,'') para.gsub!(/<0;\w\d+;[um]\d+><#@dp:#@dp>/,'') @@ -357,8 +356,12 @@ module SiSU_XML_munge #para.gsub!(/^_\*\s+/,'* ') para.gsub!(/(^|\s)\{\s*(\S+?\.(?:jpg|png|gif))\s+(\d+)x(\d+)(\s+[^}]+)?\}(https?:\/\/\S+)/,%{\\1[\\2] \\5}) para.gsub!(/(^|\s)\{\s*(\S+?\.(?:jpg|png|gif))(\s+[^}]+)?\}(https?:\/\/\S+)/,%{\\1\\2}) - para.gsub!(/(^|\s)\{([^}]+)\}(https?:\/\/[^"><]+?)([,.:;"><]?(?:\s|$))/,'\1\2\4') - para.gsub!(/(^|\s)(https?:\/\/[^"><]+?)([,.:;"><]?(?:\s|$))/,%{\\1#{@url_brace.xml_open}\\2#{@url_brace.xml_close}\\3}) + para.gsub!(/(^|\s)\{([^}]+)\}(https?:\/\/[^"><]+?)([,.:;"><]?(?:\s|$))/, + '\1\2\4') #watch, compare html_tune + para.gsub!(/(^|\s)((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/, + %{\\1#{@url_brace.xml_open}\\2#{@url_brace.xml_close}\\3}) + para.gsub!(/(^|\s)[_\\]((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,'\1\2\3') #escaped urls not linked, deal with later + #para.gsub!(/(^|\s)[_\\]((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,'\1\2\3') #escaped urls not linked, deal with later para.gsub!(/ /,' ') #clean para end diff --git a/lib/sisu/v0/texinfo_format.rb b/lib/sisu/v0/texinfo_format.rb index 3184ee80..e4803d87 100644 --- a/lib/sisu/v0/texinfo_format.rb +++ b/lib/sisu/v0/texinfo_format.rb @@ -327,9 +327,10 @@ WOK @para.gsub!(/(\$)/,"\\$") @para.gsub!(/\~/,'\\~') @para.gsub!(/%/,"\\%") - if @para !~ /^\s*<:image|\}:image\s/ - @para.gsub!(/_/,'\_') - end + #if @para !~ /^\s*<:image|\}:image\s/ + # @para.gsub!(/_/,'\_') + #end + @para.gsub!(/_(http:\/\/)/,'\1') @para.gsub!(/§/i,'\S') @para.gsub!(/£/i,'\pounds') @para.gsub!(/å/i,'\aa') diff --git a/lib/sisu/v0/texpdf.rb b/lib/sisu/v0/texpdf.rb index 5eaf7311..6ac87240 100644 --- a/lib/sisu/v0/texpdf.rb +++ b/lib/sisu/v0/texpdf.rb @@ -358,6 +358,7 @@ module SiSU_TeX end @tex_file <<</,' ') @string.gsub!(/<\/a>/,' ') - if @string !~/^\s*<:image|\}image/ #watch carefully could introduce breaks ! removed \}http changed 0.1.1-4 200501 - if @string =~/(?:https?|ftp):\/\/\S+?<\/\S>/ #problems where have font attribute eg (change substitution sequence?) # [^\}>] added for escaped curly brace <=curlyclose> watch - @string.gsub!(/[^\}>]((?:https?|ftp):\/\/\S+?)(<\/\S>)/,' \begin{scriptsize}\href{\1}{ \1 } \end{scriptsize}\2') - else # regular urls !! http:// href - if @string=~/(?:https?|ftp):\/\/\S+?[,.]? / - @string.gsub!(/(?:^|[^\}>])((?:https?|ftp):\/\/\S+?)([,.])? /," #{@url_brace.tex_open}\\begin{scriptsize}\\href{\\1}{ \\1}\\end{scriptsize}#{@url_brace.tex_close}\\2 ") #tamper - else @string.gsub!(/(?:^|[^\}>])((?:https?|ftp):\/\/\S+)/,' \begin{scriptsize}\href{ \1 }{\1} \end{scriptsize}') #should not be necessary, not checked - end - end - else #@string =~/\\~\\\{.+?\\\}\\~/ # \\~\\{1 - #@string.gsub!(/(?:\n)?(http:\/\/\S+)/," \\begin{scriptsize}\\href{\\1}{\\1} \\end{scriptsize}") #bug... - end + @string.gsub!(/[^\}>_]((?:https?|ftp):\/\/\S+?)(<\/\S>)/,' \begin{scriptsize}\href{\1}{\1} \end{scriptsize}\2') #special case + @string.gsub!(/((?:^|\s)[}])((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,'\1\begin{scriptsize}\\href{\2}{\2}\end{scriptsize}\3') #special case \{ e.g. \}http://url + @string.gsub!(/(^|\s)(?:\\_|\\)((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,'\1\begin{scriptsize}\\href{\2}{\2}\end{scriptsize}\3') #specially escaped url no decoration + @string.gsub!(/(^|\s)((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,"\\1#{@url_brace.tex_open}\\begin{scriptsize}\\href{\\2}{\\2}\\end{scriptsize}#{@url_brace.tex_close}\\3") #url matching with decoration @string.gsub!(/<:ee>/,'') @string.gsub!(//,' ') #proposed change, insert, but may be redundant @@ -704,10 +696,20 @@ WOK #end BUGWATCH end def indent(lev) - @string.gsub!(/<:i#{lev}>(.*)/m, - "\\begin{ParagraphIndent}{0.0#{lev}\\columnwidth} \\1 -\\end{ParagraphIndent} -") + indent=case lev + when /1/; '0mm' + when /2/; '10mm' + when /3/; '20mm' + when /4/; '30mm' + when /5/; '40mm' + when /6/; '50mm' + when /7/; '60mm' + when /8/; '70mm' + when /9/; '80mm' + end + @string.gsub!(/<:i#{lev}>\s*(.*)/m, + "\\begin{ParagraphIndent}{#{indent}}\\1 +\\end{ParagraphIndent}}") end def symbol_graphic dir=SiSU_Env::Info_env.new(@md.fns) @@ -1192,21 +1194,17 @@ WOK def a4generic end def para_num - paranumber_display=if @md.mod.inspect =~/--no-ocn/; '' #!ocn no object citation numbering + paranumber_display=if @md.mod.inspect =~/--no-ocn/; '' #!ocn no object citation numbering else "\\begin{tiny}~\\end{tiny}{\\marginpar{\\begin{tiny}#@string1\\end{tiny}}}" #ocn object citation numbering end if @string !~/^([1-6a-z-]#{@@tilde}\S*|<:.+?>|#{@md.lv1}|#{@md.lv2}|#{@md.lv3}|#{@md.lv4}|#{@md.lv5}|#{@md.lv6})/ @string.gsub!(/^\s*(.+)/m,"#{paranumber_display}\\1\n") #watch - in 1-6 is suspect else if (@string =~/^(?:[1-6a-z-]#{@@tilde}\S*|<:.+?>)/) #watch - in 1-6 is suspect - @string.gsub!(/^([1-6a-z-]#{@@tilde}\S*|<:.+?>)\s*(.+)/m,"\\1 #{paranumber_display} \\2\n") #watch - in 1-6 is suspect + @string.gsub!(/^([1-6a-z-]#{@@tilde}\S*)\s*(.+)/m,"\\1 #{paranumber_display} \\2\n") #watch - in 1-6 is suspect + @string.gsub!(/^(<:.+?>)\s*(.+)/m,"\\1 #{paranumber_display}\\2\n") else - @string.gsub!(/(#{@md.lv1}.+)$/,"#{@md.lv1} #{paranumber_display}\\1\n") - @string.gsub!(/(#{@md.lv2}.+)$/,"#{@md.lv2} #{paranumber_display}\\1\n") - @string.gsub!(/(#{@md.lv3}.+)$/,"#{@md.lv3} #{paranumber_display}\\1\n") - @string.gsub!(/(#{@md.lv4}.+)$/,"#{@md.lv4} #{paranumber_display}\\1\n") - @string.gsub!(/(#{@md.lv5}.+)$/,"#{@md.lv5} #{paranumber_display}\\1\n") - @string.gsub!(/(#{@md.lv6}.+)$/,"#{@md.lv6} #{paranumber_display}\\1\n") + @string.gsub!(/((#{@md.lv1}|#{@md.lv2}|#{@md.lv3}|#{@md.lv4}|#{@md.lv5}|#{@md.lv6}).+)$/,"\\2 #{paranumber_display}\\1\n") end end @string -- cgit v1.2.3