From a1fd226ef8ae434f81f010ee8681fc059dbbe6f2 Mon Sep 17 00:00:00 2001 From: Ralph Amissah Date: Thu, 26 Jul 2007 17:51:16 +0100 Subject: multiple url matching refinements, open archive initiative --- CHANGELOG | 13 ++++++------- lib/sisu/v0/html_tune.rb | 4 ++-- lib/sisu/v0/odf.rb | 12 ++++++------ lib/sisu/v0/shared_html_lite.rb | 4 ++-- lib/sisu/v0/shared_xml.rb | 10 +++++----- lib/sisu/v0/texpdf_format.rb | 4 ++-- lib/sisu/v0/xml_md_oai_pmh_dc.rb | 5 +++++ 7 files changed, 28 insertions(+), 24 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 93d3ed72..2db96703 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -12,20 +12,18 @@ http://www.jus.uio.no/sisu/pkg/src/sisu_0.55.6.orig.tar.gz sisu_0.55.6-1.dsc sisu_0.55.6-1.diff.gz - * db html, fix related to match of multiple urls within paragraph + * matching of multiple urls within a paragraph + * db html (html_lite), bug fix + * multiple urls listed, refinement: html, html_lite, xml, odf, texpdf * open archive initiative for metadata harvesting, initial implementation, Dublin Core, XML output available (-O), decide use later (filenames, output - dir etc.), look at later and refine accordingly: - http://www.openarchives.org/pmh/ - http://www.openarchives.org/OAI/2.0/openarchivesprotocol.htm#dublincore - http://es.dublincore.org/documents/usageguide/elements.shtml - http://dublincore.org/documents/dces/ - see also http://dublincore.org/documents/dcmes-xml/ + dir etc.) 
* debian vim * moved vim install back to addons * added recommends vim-addon-manager + (thanks zack) %% sisu_0.55.5.orig.tar.gz (2007-07-22:29/7) http://www.jus.uio.no/sisu/pkg/src/sisu_0.55.5.orig.tar.gz @@ -47,6 +45,7 @@ http://www.jus.uio.no/sisu/pkg/src/sisu_0.55.5.orig.tar.gz open standards * debian vim, syntax and ftplugin install moved to /usr/share/vim-scripts + (syntax file synced with Bram, thanks) %% sisu_0.55.4.orig.tar.gz (2007-07-20:29/5) http://www.jus.uio.no/sisu/pkg/src/sisu_0.55.4.orig.tar.gz diff --git a/lib/sisu/v0/html_tune.rb b/lib/sisu/v0/html_tune.rb index 1d3461c3..66c45aed 100644 --- a/lib/sisu/v0/html_tune.rb +++ b/lib/sisu/v0/html_tune.rb @@ -325,9 +325,9 @@ module SiSU_Tune if (para =~/\b\S+\@\S+?\.\S+/ and para !~/(\"\S+\@\S+?\.\S+\"|>\S+\@\S+?\.\S+?<)/) para.gsub!(/\b(\S+\@\S+?\.\S+)(\s)/,'<\1>\2') end - para.gsub!(/(^|\s)[_\\]((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,'\1\2\3') #http ftp matches escaped, no decoration + para.gsub!(/\b[_\\]((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,'\1\2') #http ftp matches escaped, no decoration para.gsub!(/((?:^|\s)[}])((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,'\1\2\3') #special case \{ e.g. 
\}http://url - para.gsub!(/(^|\s)((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,%{\\1#{@url_brace.xml_open}\\2#{@url_brace.xml_close}\\3}) #http ftp matches with decoration + para.gsub!(/\b((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,%{#{@url_brace.xml_open}\\1#{@url_brace.xml_close}\\2}) #http ftp matches with decoration if (para =~/..\/\S+/ and para !~/(\"..\/\S+?\"|>\s*..\/\S+<)/) para.gsub!(/(\.\.\/\S+)/,'\1') end diff --git a/lib/sisu/v0/odf.rb b/lib/sisu/v0/odf.rb index cff57888..fbd4cc62 100644 --- a/lib/sisu/v0/odf.rb +++ b/lib/sisu/v0/odf.rb @@ -295,12 +295,12 @@ module SiSU_ODF end def normal(para) #P1 - P3 para.gsub!(@serial,'') - para.gsub!(/(^|\s)[_\\]((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/, - %{\\1\\2\\3}) #http ftp matches escaped, no decoration + para.gsub!(/\b[_\\]((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/, + %{\\1\\2}) #http ftp matches escaped, no decoration para.gsub!(/((?:^|\s)[}])((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/, %{\\1\\2\\3}) #special case \{ e.g. 
\}http://url - para.gsub!(/(^|\s)((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/, - %{\\1#{@url_brace.xml_open}\\2#{@url_brace.xml_close}\\3}) #http ftp matches with decoration + para.gsub!(/\b((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/, + %{#{@url_brace.xml_open}\\1#{@url_brace.xml_close}\\2}) #http ftp matches with decoration para.gsub!(/([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+)/, %{#{@url_brace.xml_open}\\1#{@url_brace.xml_close}}) para=case para @@ -375,8 +375,8 @@ module SiSU_ODF parray=[] para.split(/<:?br(?: \/)?>/).each do |parablock| parablock=group_clean(parablock) - parablock.gsub!(/(^|\s)[_\\]((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/, - %{\\1\\2\\3}) #http ftp matches escaped, no decoration + parablock.gsub!(/\b[_\\]((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/, + %{\\1\\2}) #http ftp matches escaped, no decoration parray << %{#{parablock}} if parablock =~/\S+/ end para=parray.join + '' diff --git a/lib/sisu/v0/shared_html_lite.rb b/lib/sisu/v0/shared_html_lite.rb index 2bcea532..16491ebf 100644 --- a/lib/sisu/v0/shared_html_lite.rb +++ b/lib/sisu/v0/shared_html_lite.rb @@ -131,9 +131,9 @@ module SiSU_Format_Shared words=word_mode.join(' ') para.gsub!(/.+/,words) end - para.gsub!(/(^|\s)[_\\]((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,'\1\2\3') #http ftp matches escaped, no decoration + para.gsub!(/\b[_\\]((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,'\1\2') #http ftp matches escaped, no decoration para.gsub!(/((?:^|\s)[}])((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,'\1\2\3') #special case \{ e.g. 
\}http://url - para.gsub!(/(^|\s)((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,%{\\1#{@url_brace.xml_open}\\2#{@url_brace.xml_close}\\3}) #http ftp matches with decoration + para.gsub!(/\b((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,%{#{@url_brace.xml_open}\\1#{@url_brace.xml_close}\\2}) #http ftp matches with decoration para end def paragraph diff --git a/lib/sisu/v0/shared_xml.rb b/lib/sisu/v0/shared_xml.rb index 995044db..249085a1 100644 --- a/lib/sisu/v0/shared_xml.rb +++ b/lib/sisu/v0/shared_xml.rb @@ -356,11 +356,11 @@ module SiSU_XML_munge #para.gsub!(/^_\*\s+/,'* ') para.gsub!(/(^|\s)\{\s*(\S+?\.(?:jpg|png|gif))\s+(\d+)x(\d+)(\s+[^}]+)?\}(https?:\/\/\S+)/,%{\\1[\\2] \\5}) para.gsub!(/(^|\s)\{\s*(\S+?\.(?:jpg|png|gif))(\s+[^}]+)?\}(https?:\/\/\S+)/,%{\\1\\2}) - para.gsub!(/(^|\s)\{([^}]+)\}(https?:\/\/[^"><]+?)([,.:;"><]?(?:\s|$))/, - '\1\2\4') #watch, compare html_tune - para.gsub!(/(^|\s)((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/, - %{\\1#{@url_brace.xml_open}\\2#{@url_brace.xml_close}\\3}) - para.gsub!(/(^|\s)[_\\]((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,'\1\2\3') #escaped urls not linked, deal with later + para.gsub!(/\B\{([^}]+)\}(https?:\/\/[^"><]+?)([,.:;"><]?(?:\s|$))/, + '\1\3') #watch, compare html_tune + para.gsub!(/\b((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/, + %{#{@url_brace.xml_open}\\1#{@url_brace.xml_close}\\2}) + para.gsub!(/\b[_\\]((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,'\1\2') #escaped urls not linked, deal with later #para.gsub!(/(^|\s)[_\\]((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,'\1\2\3') #escaped urls not linked, deal with later para.gsub!(/ /,' ') #clean para diff --git a/lib/sisu/v0/texpdf_format.rb b/lib/sisu/v0/texpdf_format.rb index 4a8d2cb5..81646f23 100644 --- a/lib/sisu/v0/texpdf_format.rb +++ b/lib/sisu/v0/texpdf_format.rb @@ -501,8 +501,8 @@ WOK @string.gsub!(/<\/a>/,' ') @string.gsub!(/[^\}>_]((?:https?|ftp):\/\/\S+?)(<\/\S>)/,' 
\begin{scriptsize}\href{\1}{\1} \end{scriptsize}\2') #special case @string.gsub!(/((?:^|\s)[}])((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,'\1\begin{scriptsize}\\href{\2}{\2}\end{scriptsize}\3') #special case \{ e.g. \}http://url - @string.gsub!(/(^|\s)(?:\\_|\\)((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,'\1\begin{scriptsize}\\href{\2}{\2}\end{scriptsize}\3') #specially escaped url no decoration - @string.gsub!(/(^|\s)((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,"\\1#{@url_brace.tex_open}\\begin{scriptsize}\\href{\\2}{\\2}\\end{scriptsize}#{@url_brace.tex_close}\\3") #url matching with decoration + @string.gsub!(/\B(?:\\_|\\)((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,'\begin{scriptsize}\\href{\1}{\1}\end{scriptsize}\2') #specially escaped url no decoration + @string.gsub!(/(^|\s)((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?=\s|$))/,"\\1#{@url_brace.tex_open}\\begin{scriptsize}\\href{\\2}{\\2}\\end{scriptsize}#{@url_brace.tex_close}\\3") #url matching with decoration positive lookahead, sequence issue with { linked }http://url cannot use \b at start @string.gsub!(/<:ee>/,'') @string.gsub!(//,' ') #proposed change, insert, but may be redundant diff --git a/lib/sisu/v0/xml_md_oai_pmh_dc.rb b/lib/sisu/v0/xml_md_oai_pmh_dc.rb index 7ac7c3a6..1d7008a1 100644 --- a/lib/sisu/v0/xml_md_oai_pmh_dc.rb +++ b/lib/sisu/v0/xml_md_oai_pmh_dc.rb @@ -182,6 +182,11 @@ WOK end end __END__ +http://www.openarchives.org/pmh/ +http://www.openarchives.org/OAI/2.0/openarchivesprotocol.htm#dublincore +http://es.dublincore.org/documents/usageguide/elements.shtml +http://dublincore.org/documents/dces/ +see also http://dublincore.org/documents/dcmes-xml/ #http://www.openarchives.org/OAI/2.0/openarchivesprotocol.htm#dublincore #sample implementation, e.g. 2 -- cgit v1.2.3