summaryrefslogtreecommitdiffstats
path: root/lib/sisu/v5/harvest_authors.rb
diff options
context:
space:
mode:
Diffstat (limited to 'lib/sisu/v5/harvest_authors.rb')
-rw-r--r--lib/sisu/v5/harvest_authors.rb407
1 files changed, 407 insertions, 0 deletions
diff --git a/lib/sisu/v5/harvest_authors.rb b/lib/sisu/v5/harvest_authors.rb
new file mode 100644
index 0000000..89a64fe
--- /dev/null
+++ b/lib/sisu/v5/harvest_authors.rb
@@ -0,0 +1,407 @@
+# encoding: utf-8
+=begin
+
+ * Name: SiSU
+
+ * Description: a framework for document structuring, publishing and search
+ metadata harvest, extract authors and their writings from document set
+
+ * Author: Ralph Amissah
+
+ * Copyright: (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
+ 2007, 2008, 2009, 2010, 2011, 2012, 2013 Ralph Amissah, All Rights Reserved.
+
+ * License: GPL 3 or later:
+
+ SiSU, a framework for document structuring, publishing and search
+
+ Copyright (C) Ralph Amissah
+
+ This program is free software: you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by the Free
+ Software Foundation, either version 3 of the License, or (at your option)
+ any later version.
+
+ This program is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ You should have received a copy of the GNU General Public License along with
+ this program. If not, see <http://www.gnu.org/licenses/>.
+
+ If you have Internet connection, the latest version of the GPL should be
+ available at these locations:
+ <http://www.fsf.org/licensing/licenses/gpl.html>
+ <http://www.gnu.org/licenses/gpl.html>
+
+ <http://www.sisudoc.org/sisu/en/manifest/gpl.fsf.html>
+
+ * SiSU uses:
+ * Standard SiSU markup syntax,
+ * Standard SiSU meta-markup syntax, and the
+ * Standard SiSU object citation numbering and system
+
+ * Hompages:
+ <http://www.jus.uio.no/sisu>
+ <http://www.sisudoc.org>
+
+ * Download:
+ <http://www.sisudoc.org/sisu/en/SiSU/download.html>
+
+ * Git
+ <http://sources.sisudoc.org/gitweb/?p=code/sisu.git;a=summary>
+ <http://sources.sisudoc.org/?p=code/sisu.git;a=blob;f=lib/sisu/v5/harvest_authors.rb;hb=HEAD>
+
+ * Ralph Amissah
+ <ralph@amissah.com>
+ <ralph.amissah@gmail.com>
+
+ ** Description: simple xml representation (sax style)
+
+=end
+module SiSU_HarvestAuthors
+ require_relative 'author_format' # author_format.rb
+ class Songsheet
+ @@the_idx_authors={}
+ def initialize(opt,env)
+ @opt,@env=opt,env
+ @file_list=opt.files
+ end
+ def songsheet
+ idx_array={}
+ @opt.f_pths.each do |y|
+ lang_hash_file_array={}
+ name=y[:f]
+ filename=y[:pth] + '/' + y[:f]
+ File.open(filename,'r') do |file|
+ file.each_line("\n\n") do |line|
+ if line =~/^@(?:title|creator|date):(?:\s|$)/m
+ lang_hash_file_array[y[:lng_is]] ||= []
+ lang_hash_file_array[y[:lng_is]] << line
+ elsif line =~/^@\S+?:(?:\s|$)/m \
+ or line =~/^(?:\s*\n|%+ )/
+ else break
+ end
+ end
+ end
+ lang_hash_file_array.each_pair do |lang,a|
+ idx_array[lang] ||= []
+ idx_array=SiSU_HarvestAuthors::Harvest.new(@opt,@env,a,filename,name,idx_array,lang).extract_harvest
+ end
+ end
+ the_idx=SiSU_HarvestAuthors::Index.new(idx_array,@@the_idx_authors).construct_book_author_index
+ SiSU_HarvestAuthors::OutputIndex.new(@opt,the_idx).html_print.html_songsheet
+ end
+ end
+ class Harvest
+ def initialize(opt,env,data,filename,name,idx_array,lang)
+ @opt,@env,@data,@filename,@name,@idx_array,@lang=opt,env,data,filename,name,idx_array,lang
+ end
+ def extract_harvest
+ data,filename,name,idx_array,lang=@data,@filename,@name,@idx_array,@lang
+ @title,@subtitle,@fulltitle,@author,@author_format,@date=nil,nil,nil,nil,nil,nil
+ @authors=[]
+ rgx={}
+ rgx[:author]=/^@creator:(?:[ ]+|.+?:author:[ ]+)(.+?)(?:\||\n)/m
+ rgx[:title]=/^@title:[ ]+(.+)/
+ rgx[:subtitle]=/^@title:.+?:subtitle:[ ]+(.+?)\n/m
+ rgx[:date]=/^@date:(?:[ ]+|.+?:published:[ ]+)(\d{4})/m
+ data.each do |para|
+ if para=~ rgx[:title]
+ @title=rgx[:title].match(para)[1]
+ end
+ if para=~ rgx[:subtitle]
+ @subtitle=rgx[:subtitle].match(para)[1]
+ end
+ if para=~ rgx[:author]
+ @author_format=rgx[:author].match(para)[1]
+ end
+ if para=~ rgx[:date]
+ @date=rgx[:date].match(para)[1]
+ end
+ break if @title && @subtitle && @author && @date
+ end
+ @fulltitle=@subtitle ? (@title + ' - ' + @subtitle) : @title
+ if @title \
+ and @author_format
+ creator=SiSU_FormatAuthor::Author.new(@author_format.strip).author_details
+ @authors,@authorship=creator[:authors],creator[:authorship]
+ file=if name=~/~[a-z]{2,3}\.ss[mt]$/
+ name.sub(/~[a-z]{2,3}\.ss[mt]$/,'')
+ else
+ name.sub(/\.ss[mt]$/,'')
+ end
+ page=if @env.output_dir_structure.by? == :language
+ "#{lang}/sisu_manifest.html"
+ else
+ "sisu_manifest.#{lang}.html"
+ end
+ idx_array[lang] <<= { filename: filename, file: file, date: @date, title: @fulltitle, author: creator, page: page, lang: lang }
+ else
+ #p "missing author field: #{@filename} title: #{@title}; author: #{@author_format}"
+ end
+ idx_array[lang]=idx_array[lang].flatten
+ idx_array
+ end
+ end
+ class Index
+ def initialize(idx_array,the_idx)
+ @idx_array,@the_idx=idx_array,the_idx
+ @@the_idx_authors=@the_idx
+ end
+ def capital(txt)
+ txt[0].chr.capitalize + txt[1,txt.length]
+ end
+ def construct_book_author_index
+ idx_array=@idx_array
+ idx_array.each_pair do |lang,idx_array|
+ @@the_idx_authors[lang] ||= {}
+ idx_array.each do |idx|
+ idx[:author][:last_first_format_a].each do |author|
+ author=author.strip
+ if @@the_idx_authors[lang][author].is_a?(NilClass)
+ @@the_idx_authors[lang][author]={ md: [] }
+ end
+ @@the_idx_authors[lang][author][:md] << { filename: idx[:filename], file: idx[:file], author: idx[:author], title: idx[:title], date: idx[:date], page: idx[:page], lang: idx[:lang] }
+ end
+ end
+ end
+ @the_idx=@@the_idx_authors
+ end
+ end
+ class OutputIndex
+ require_relative 'i18n' # i18n.rb
+ def initialize(opt,the_idx)
+ @opt,@the_idx=opt,the_idx
+ @env=SiSU_Env::InfoEnv.new
+ @rc=SiSU_Env::GetInit.new.sisu_yaml.rc
+ @alphabet_list=%W[9 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z]
+ @alph=@alphabet_list.dup
+ @letter=@alph.shift
+ @vz=SiSU_Viz::Defaults.new
+ end
+ def html_file_open
+ @the_idx.keys.each do |lng|
+ @output ||={}
+ @output[lng] ||={}
+ harvest_pth,file='',''
+ if @env.output_dir_structure.by? == :language
+ harvest_pth="#{@env.path.webserv}/#{@opt.base_stub}/#{lng}/manifest"
+ file="#{harvest_pth}/authors.html"
+ elsif @env.output_dir_structure.by? == :filetype
+ harvest_pth="#{@env.path.webserv}/#{@opt.base_stub}/manifest"
+ file="#{harvest_pth}/authors.#{lng}.html"
+ elsif @env.output_dir_structure.by? == :filename
+ harvest_pth="#{@env.path.webserv}/#{@opt.base_stub}"
+ file="#{harvest_pth}/authors.#{lng}.html"
+ end
+ FileUtils::mkdir_p(harvest_pth) unless FileTest.directory?(harvest_pth)
+ fileinfo=(@opt.cmd =~/[vVM]/) ? ("file://#{file}") : ''
+ SiSU_Screen::Ansi.new(@opt.cmd,"harvest authors (#{@opt.files.length} files)",fileinfo).dark_grey_title_hi unless @opt.cmd =~/q/
+ @output[lng][:html]=File.new(file,'w')
+ end
+ end
+ def html_file_close
+ @the_idx.keys.each do |lng|
+ @output[lng][:html].close
+ @output[lng][:html_mnt].close if @output[lng][:html_mnt].is_a?(File)
+ end
+ end
+ def html_print
+ def html_songsheet
+ html_file_open
+ html_head
+ html_alph
+ html_body
+ html_tail
+ html_file_close
+ end
+ def html_head_adjust(lng,type='')
+ css_path,topics='',''
+ if @env.output_dir_structure.by? == :language
+ css_path=(type !~/maintenance/) \
+ ? '../../_sisu/css/harvest.css'
+ : 'harvest.css'
+ topics='topics.html'
+ elsif @env.output_dir_structure.by? == :filetype
+ css_path=(type !~/maintenance/) \
+ ? '../_sisu/css/harvest.css'
+ : 'harvest.css'
+ topics="topics.#{lng}.html"
+ elsif @env.output_dir_structure.by? == :filename
+ css_path=(type !~/maintenance/) \
+ ? './_sisu/css/harvest.css'
+ : 'harvest.css'
+ topics="topics.#{lng}.html"
+ end
+ ln=SiSU_i18n::Languages.new.language.list
+ harvest_languages=''
+ @the_idx.keys.each do |lng|
+ if @env.output_dir_structure.by? == :language
+ harvest_pth="../../#{lng}/manifest"
+ file="#{harvest_pth}/authors.html"
+ elsif @env.output_dir_structure.by? == :filetype
+ harvest_pth='.'
+ file="#{harvest_pth}/authors.#{lng}.html"
+ elsif @env.output_dir_structure.by? == :filename
+ harvest_pth='.'
+ file="#{harvest_pth}/authors.#{lng}.html"
+ end
+ l=ln[lng][:t]
+ harvest_languages += %{<a href="#{file}">#{l}</a>&nbsp;&nbsp;&nbsp;}
+ end
+ sv=SiSU_Env::InfoVersion.instance.get_version
+ if @env.output_dir_structure.by? == :language
+ home_pth='../..'
+ output_structure_by='(output organised by language &amp; filetype)'
+ elsif @env.output_dir_structure.by? == :filetype
+ home_pth='..'
+ output_structure_by='(output organised by filetype)'
+ elsif @env.output_dir_structure.by? == :filename
+ home_pth='.'
+ output_structure_by='(output organised by filename)'
+ else
+ home_pth='.'
+ output_structure_by='(output organised by ?)'
+ end
+ <<WOK
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<title>SiSU Metadata Harvest - Authors</title>
+<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
+<meta name="dc.title" content= "SiSU metadata harvest, Authors - SiSU information Structuring Universe, Structured information Serialised Units" />
+<meta name="dc.subject" content= "document structuring, ebook, publishing, PDF, LaTeX, XML, ODF, SQL, postgresql, sqlite, electronic book, electronic publishing, electronic document, electronic citation, data structure, citation systems, granular search, digital library" />
+<meta name="generator" content="#{sv[:project]} #{sv[:version]} of #{sv[:date_stamp]} (n*x and Ruby!)" />
+<link rel="generator" href="http://www.jus.uio.no/sisu/SiSU" />
+<link rel="stylesheet" href="#{css_path}" type="text/css" />
+<link rel="shortcut icon" href="../_sisu/image/rb7.ico" />
+</head>
+<body bgcolor="#ffffff" text="#000000" link="#003090" lang="en" xml:lang="en">
+<a name="top" id="top"></a>
+<a name="up" id="up"></a>
+<a name="start" id="start"></a>
+<h1>SiSU Metadata Harvest - Authors #{output_structure_by}</h1>
+<p>[<a href="#{home_pth}/index.html">&nbsp;HOME&nbsp;</a>] also see <a href="#{topics}">SiSU Metadata Harvest - Topics</a></p>
+<p>#{@env.widget_static.search_form}</p>
+<hr />
+<p class="tiny">#{harvest_languages}</p>
+<hr />
+WOK
+ end
+ def html_head
+ @the_idx.keys.each do |lng|
+ @output[lng][:html_mnt] << html_head_adjust(lng,'maintenance') if @opt.cmd.inspect =~/M/
+ @output[lng][:html] << html_head_adjust(lng)
+ end
+ end
+ def html_alph
+ a=[]
+ a << '<p>'
+ @alph.each do |x|
+ a << ((x =~/[0-9]/) \
+ ? ''
+ : %{<a href="##{x}">#{x}</a>,&nbsp;})
+ end
+ a=a.join
+ @the_idx.keys.each do |lng|
+ @output[lng][:html_mnt] << a if @opt.cmd.inspect =~/M/
+ @output[lng][:html] << a
+ end
+ end
+ def html_tail
+ a =<<WOK
+<hr />
+<a name="bottom" id="bottom"></a>
+<a name="down" id="down"></a>
+<a name="end" id="end"></a>
+<a name="finish" id="finish"></a>
+<a name="stop" id="stop"></a>
+<a name="credits"></a>
+#{@vz.credits_sisu}
+</body>
+</html>
+WOK
+ @the_idx.keys.each do |lng|
+ @output[lng][:html_mnt] << a if @output[lng][:html_mnt].is_a?(File)
+ @output[lng][:html] << a
+ end
+ end
+ def do_html(lng,html)
+ @output[lng][:html_mnt] << html if @output[lng][:html_mnt].is_a?(File)
+ @output[lng][:html] << html
+ end
+ def do_string_name(lng,attrib,string)
+ f=/^(\S)/.match(string[0])[1]
+ if @lng != lng
+ @alph=@alphabet_list.dup
+ @letter=@alph.shift
+ @lng = lng
+ end
+ if @letter < f
+ while @letter < f
+ if @alph.length > 0
+ @letter=@alph.shift
+ if @output[lng][:html_mnt].is_a?(File)
+ @output[lng][:html_mnt] << %{\n<p class="letter"><a name="#{@letter}"></p>#{@letter}</a><p class="book_index_lev1"><a name="#{@letter.downcase}"></a></p>}
+ end
+ @output[lng][:html] << %{\n<p class="letter"><a name="#{@letter}">#{@letter}</a></p><p class="book_index_lev1"><a name="#{@letter.downcase}"></a></p>}
+ else break
+ end
+ end
+ end
+ end
+ def html_body
+ the_idx=@the_idx
+ the_idx.each_pair do |lng,lng_array|
+ lng_array.sort.each do |a|
+ do_string_name(lng,'',a)
+ name=a[0].sub(/(.+?)(?:,.+|$)/,'\1').gsub(/\s+/,'_')
+ x = %{<p class="author"><a name="#{name}">#{a[0]}</a></p>}
+ if @output[lng][:html_mnt].is_a?(File)
+ @output[lng][:html_mnt] << x
+ end
+ @output[lng][:html] << x
+ works=[]
+ a[1][:md].each do |x|
+ manifest_at=if @env.output_dir_structure.by? == :language
+ manifest_pth="#{@env.path.output}/#{x[:file]}"
+ x[:file] + '.html'
+ elsif @env.output_dir_structure.by? == :filetype
+ manifest_name=x[:file]
+ x[:file] + '.' + lng + '.html'
+ elsif @env.output_dir_structure.by? == :filename
+ "./#{x[:file]}/#{x[:page]}"
+ else '' #error
+ end
+ work=[ "#{x[:date]} #{x[:title]}", %{<p class="publication">#{x[:date]} <a href="#{manifest_at}">#{x[:title]}</a>, #{x[:author][:authors_s]}</p>} ]
+ works<<=(@output[lng][:html_mnt].is_a?(File)) \
+ ? (work.concat([%{<p class="publication">[<a href="#{x[:file]}.sst">src</a>]&nbsp;&nbsp;#{x[:date]} <a href="file://#{manifest_at}">#{x[:title]}</a>, #{x[:author][:authors_s]} -- [<a href="#{x[:file]}.sst">#{x[:file]}.sst</a>]</p>}]))
+ : work
+ end
+ works.sort_by {|x| x[0]}.each do |x|
+ @output[lng][:html] << x[1]
+ @output[lng][:html_mnt] << x[2] if @output[lng][:html_mnt].is_a?(File)
+ end
+ end
+ end
+ end
+ self
+ end
+ def screen_print
+ def cycle
+ the_idx=@the_idx
+ the_idx.sort.each do |a|
+ puts a[0]
+ a[1][:md].each do |x|
+ puts "\t" + x[:file]
+ end
+ end
+ end
+ self
+ end
+ end
+end
+__END__