diff options
Diffstat (limited to 'lib/sisu/v2/db_import.rb')
-rw-r--r-- | lib/sisu/v2/db_import.rb | 901 |
1 files changed, 901 insertions, 0 deletions
# coding: utf-8
=begin

 * Name: SiSU

 * Description: a framework for document structuring, publishing and search

 * Author: Ralph Amissah

 * Copyright: (C) 1997 - 2010, Ralph Amissah, All Rights Reserved.

 * License: GPL 3 or later:

   SiSU, a framework for document structuring, publishing and search

   Copyright (C) Ralph Amissah

   This program is free software: you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by the Free
   Software Foundation, either version 3 of the License, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
   more details.

   You should have received a copy of the GNU General Public License along with
   this program. If not, see <http://www.gnu.org/licenses/>.

   If you have Internet connection, the latest version of the GPL should be
   available at these locations:
   <http://www.fsf.org/licensing/licenses/gpl.html>
   <http://www.gnu.org/licenses/gpl.html>

   <http://www.jus.uio.no/sisu/gpl.fsf/toc.html>
   <http://www.jus.uio.no/sisu/gpl.fsf/doc.html>
   <http://www.jus.uio.no/sisu/gpl.fsf/plain.txt>

 * SiSU uses:
   * Standard SiSU markup syntax,
   * Standard SiSU meta-markup syntax, and the
   * Standard SiSU object citation numbering and system

 * Homepages:
   <http://www.jus.uio.no/sisu>
   <http://www.sisudoc.org>

 * Download:
   <http://www.jus.uio.no/sisu/SiSU/download.html>

 * Ralph Amissah
   <ralph@amissah.com>
   <ralph.amissah@gmail.com>

 ** Description: modules shared by the different db types, dbi, postgresql,
    sqlite

=end
module SiSU_DB_import
  require "#{SiSU_lib}/db_columns"                         # db_columns.rb
  require "#{SiSU_lib}/db_load_tuple"                      # db_load_tuple.rb
  require "#{SiSU_lib}/shared_html_lite"                   # shared_html_lite.rb
  require 'sqlite3'
  # Builds the SQL statements that load one SiSU document (metadata, document
  # objects, endnotes and output urls) and runs them in a single transaction,
  # either through sqlite3 or through a DBI style connection (postgresql).
  class Import < SiSU_DB_columns::Column_size
    include SiSU_Param
    include SiSU_Screen
    @@dl=nil
    @@hname=nil
    attr_accessor :tp
    # opt::      processing options; this class reads opt.cmd and opt.fns
    # conn::     open database connection (sqlite3 database or DBI handle)
    # file::     passed through untouched to the SiSU_DB_tuple loaders
    # sql_type:: 'pg' (default) or a string matching /sqlite/
    def initialize(opt,conn,file,sql_type='pg')
      @opt,@conn,@file,@sql_type=opt,conn,file,sql_type
      @cX=SiSU_Screen::Ansi.new(@opt.cmd).cX
      @env=SiSU_Env::Info_env.new(@opt.fns)
      @dal="#{@env.path.dal}"
      if @opt.fns.empty? or @opt.cmd.empty?
        @fnb=''
      else
        @md=SiSU_Param::Parameters.new(@opt).get
        @fnb=@md.fnb
      end
      # the suffix alternative is now a capturing group: the original asked
      # for capture 1 of a pattern without any group, which is always nil
      @suffix=@opt.fns[/(?:.+?)(\.ssm\.sst|\.-?sst)/,1]
      @fnc="#{@dal}/#{@opt.fns}.content.rbm"
      #create? consider placing field just before clean text as opposed to seg
      #which contains seg(.html) name info seg_full would contain seg info for
      #levels 5 & 6 where available eg seg_full may be 7.3 (level 5) and 7.3.1
      #(level 6) where seg is 7
      @@seg,@@seg_full='',''
      @col=Hash.new('')
      @col[:ocn]=''
      @counter={}
      @db=SiSU_Env::Info_db.new
      # the driver is recognised by the first ten characters of #inspect
      @driver_sqlite3=(@conn.inspect[0,10]==@db.sqlite.conn_sqlite3.inspect[0,10])
      # continue numbering from the highest ids already stored
      @col[:lid]=db_import_max_id('SELECT MAX(lid) FROM documents') || 0
      @id_n=db_import_max_id('SELECT MAX(nid) FROM endnotes') || 0
      @col[:lv1]=@col[:lv2]=@col[:lv3]=@col[:lv4]=@col[:lv5]=@col[:lv6]=0
      @@dl ||=SiSU_Env::Info_env.new.digest.length
    end
    # Collects every SQL statement for the current document and executes them
    # in one transaction. For non sqlite databases a document whose filename is
    # already present in the metadata table is skipped with a notice.
    def marshal_load
      require "#{SiSU_lib}/dal"                            # dal.rb
      @dal_array=SiSU_DAL::Source.new(@opt).get            # dal file drawn here
      tell=SiSU_Screen::Ansi.new(@opt.cmd,"#{@db.psql.db}::#{@opt.fns}")
      tell.puts_blue unless @opt.cmd =~/q/
      tell=SiSU_Screen::Ansi.new(@opt.cmd,'Marshal Load',@fnc)
      tell.print_grey if @opt.cmd =~/v/
      sqlite=(@sql_type=~/sqlite/) ? true : false
      file_exist=if sqlite
        nil
      else
        # bound and regex-escaped: the filename used to be interpolated raw
        # into the statement. NOTE(review): still an unanchored match, so
        # 'a.sst' is reported as existing once 'ba.sst' is loaded - confirm
        # whether an exact comparison is safe for .ssm master documents.
        @conn.select_one('SELECT metadata.tid FROM metadata WHERE metadata.filename ~ ?',Regexp.escape(@opt.fns))
      end
      if sqlite or not file_exist
        t_d=[]                                             # transaction_data
        t_d << db_import_metadata
        t_d << db_import_documents(@dal_array)
        t_d << db_import_urls(@dal_array,@fnc)             #import OID on/off
        t_d=t_d.flatten
        if @opt.cmd =~/[MV]/
          puts @conn.class if defined? @conn.class
          puts @conn.driver_name if defined? @conn.driver_name
          puts @conn.driver if defined? @conn.driver
        end
        begin
          if sqlite
            @conn.transaction do |conn|
              t_d.each { |statement| conn.execute(statement) }
            end
          else
            #'do' works for postgresql
            @conn.do("BEGIN")
            t_d.each { |statement| @conn.do(statement) }
            @conn.do("COMMIT")
          end
        rescue => e
          db_import_rollback unless sqlite
          db_import_report_failure(e,t_d)
        end
      else
        @db=SiSU_Env::Info_db.new
        puts "\n#{@cX.grey}file #{@cX.off} #{@cX.blue}#{@opt.fns}#{@cX.off} #{@cX.grey}already exists in database#{@cX.off} #{@cX.blue}#{@db.psql.db}#{@cX.off} #{@cX.brown}update instead?#{@cX.off}"
      end
    end
    # Prepares str (modified in place) for use inside a single quoted SQL
    # literal: doubles apostrophes, turns break markers into <br />, removes
    # tag markers and reduces link markup to its visible text.
    # Returns str (the original returned whatever the last gsub! produced,
    # nil whenever that substitution found nothing).
    def special_character_escape(str)
      str.gsub!(/'/,"''")
      str.gsub!(/#{Mx[:br_line]}|#{Mx[:br_nl]}/,"<br />\n")
      str.gsub!(/#{Mx[:tag_o]}\S+?#{Mx[:tag_c]}/,'')       #check
      str.gsub!(/#{Mx[:lnk_o]}\s*(\S+?\.(?:png|jpg))(?:\s+\d+x\d+)?(.+?)#{Mx[:lnk_c]}\S+/,'[image: \1] \2')
      str.gsub!(/#{Mx[:lnk_o]}\s*(.+?)\s*#{Mx[:lnk_c]}(?:file|ftp):\/\/\S+?([.,!?]?(?:\s|$))/,'\1\2')
      str.gsub!(/#{Mx[:lnk_o]}\s*(.+?)\s*#{Mx[:lnk_c]}#{Mx[:url_o]}\S+?#{Mx[:url_c]}/,'\1')
      str
    end
    # Reduces str (modified in place) to searchable plain text: footnote
    # numbers become [n], table and html markup is dropped, linked images
    # become an [image] placeholder and whitespace is squeezed.
    # Returns str.
    def strip_markup(str)                                  #define rules, make same as in dal clean
      str.gsub!(/#{Mx[:fa_superscript_o]}(\d+)#{Mx[:fa_superscript_c]}/,'[\1]')
      str.gsub!(/(?: \\;|#{Mx[:nbsp]})+/,' ')
      str.gsub!(/#{Mx[:tc_o]}#{Mx[:tc_p]}#{Mx[:tc_p]}\d+(.+)#{Mx[:tc_c]}/u,'\1') #tables
      str.gsub!(/#{Mx[:tc_p]}#{Mx[:tc_p]}\d+#{Mx[:tc_p]}/u,' ')                  #tables
      str.gsub!(/#{Mx[:tc_p]}/u,' ')                       #tables tidy later
      str.gsub!(/<.+?>/,'')
      # scheme separator was written '//:' and so never matched a file/ftp url
      str.gsub!(/#{Mx[:lnk_o]}.+?\.(?:png|jpg|gif).+?#{Mx[:lnk_c]}(?:file|ftp):\/\/\S+ /,' [image] ') # else image names found in search
      str.gsub!(/#{Mx[:lnk_o]}.+?\.(?:png|jpg|gif).+?#{Mx[:lnk_c]}#{Mx[:url_o]}\S+?#{Mx[:url_c]}/,' [image]') # else image names found in search
      str.gsub!(/\s\s+/,' ')
      str.strip!
      str
    end
    # placeholder, intentionally empty
    def pf_db_import_transaction_open
    end
    # placeholder, intentionally empty
    def pf_db_import_transaction_close
    end
    # Fills @tp with "<column>, " / "'<value>', " pairs for every metadata
    # field present in the document header and returns the tuple produced by
    # SiSU_DB_tuple::Load_metadata. Values are escaped on a copy, so the
    # shared @md object is no longer modified.
    def db_import_metadata                                 #% import documents - populate database
      print %{ #{@cX.grey}import documents dbi_unit #{@cX.off} } unless @opt.cmd =~/q/
      @tp={}
      @md=SiSU_Param::Parameters.new(@opt).get
      if db_import_md_text { @md.title.full }              # DublinCore 1 - title
        @tp[:title]=special_character_escape(@md.title.all.dup)
        @tp[:title_f],@tp[:title_i]='title, ',"'#{@tp[:title]}', "
        @@id_t ||=0
        id_t=db_import_max_id('SELECT MAX(tid) FROM metadata')
        @@id_t=id_t if id_t
        @@id_t=0 if @@id_t.nil? or @@id_t.to_s.empty?
        #bug related, needs to be performed once at start of file, but consider
        #moving, as, placed here it means program will fail if document header
        #lacks @title:
        @@id_t+=1
        puts %{\n#{@cX.grey}Processing file number#{@cX.off}: #{@cX.green}#{@@id_t}#{@cX.off}} unless @opt.cmd =~/q/
      end
      db_import_metadata_field(:creator)           { @md.creator.author }      # DublinCore 2 - creator/author
      db_import_metadata_field(:contributor)       { @md.creator.contributor } # DublinCore 6 - contributor
      db_import_metadata_field(:translator)        { @md.creator.translator }
      db_import_metadata_field(:illustrator)       { @md.creator.illustrator }
      db_import_metadata_field(:publisher)         { @md.publisher }
      db_import_metadata_field(:prepared_by)       { @md.creator.prepared_by }
      db_import_metadata_field(:digitized_by)      { @md.creator.digitized_by }
      db_import_metadata_field(:subject)           { @md.classify.subject }    # DublinCore 3 - subject
      db_import_metadata_field(:description)       { @md.notes.description }   # DublinCore 4 - description
      # NOTE(review): the original tested classify.subject here but stored
      # classify.abstract; the stored accessor is kept - confirm it is the
      # right home for the abstract.
      db_import_metadata_field(:abstract)          { @md.classify.abstract }
      db_import_metadata_field(:rights)            { @md.rights.all }          # DublinCore 15 - rights
      db_import_metadata_field(:date)              { @md.date.published }      # DublinCore 7 - date year-mm-dd
      db_import_metadata_field(:date_created)      { @md.date.created }
      db_import_metadata_field(:date_issued)       { @md.date.issued }
      db_import_metadata_field(:date_available)    { @md.date.available }
      db_import_metadata_field(:date_modified)     { @md.date.modified }
      db_import_metadata_field(:date_valid)        { @md.date.valid }
      db_import_metadata_field(:language)          { @md.title.language }
      db_import_metadata_field(:language_original) { @md.original.language }
      db_import_metadata_field(:format)            { @md.classify.format }     # DublinCore 9 - format
      db_import_metadata_field(:identifier)        { @md.classify.identifier } # DublinCore 10 - identifier
      db_import_metadata_field(:source)            { @md.original.source }     # DublinCore 11 - source
      db_import_metadata_field(:relation)          { @md.classify.relation }   # DublinCore 13 - relation
      db_import_metadata_field(:coverage)          { @md.classify.coverage }   # DublinCore 14 - coverage
      db_import_metadata_field(:keywords)          { @md.classify.keywords }
      # the original tested notes.comment but read notes.comments; whichever
      # accessor the notes object actually offers is used
      db_import_metadata_field(:comments) do
        @md.notes.respond_to?(:comments) ? @md.notes.comments : @md.notes.comment
      end
      db_import_metadata_field(:cls_loc)           { @md.classify.loc }
      db_import_metadata_field(:cls_dewey)         { @md.classify.dewey }
      db_import_metadata_field(:cls_pg)            { @md.classify.pg }
      db_import_metadata_field(:cls_isbn)          { @md.classify.isbn }
      db_import_metadata_field(:prefix_a)          { @md.notes.prefix_a }
      db_import_metadata_field(:prefix_b)          { @md.notes.prefix_b }
      db_import_metadata_field(:fns,'filename')    { @md.fns }
      # the remaining values are stored unescaped, as before
      if @md.wc_words
        @tp[:wc_words_f],@tp[:wc_words_i]='wc_words, ',"'#{@md.wc_words}', "
      end
      if defined? @md.dgst \
      and @md.dgst.class==Array
        @tp[:dgst_f],@tp[:dgst_i]='dgst, ',"'#{@md.dgst[1]}', "
      end
      if @md.sc_date
        @tp[:sc_date_f],@tp[:sc_date_i]='sc_date, ',"'#{@md.sc_date}', "
      end
      if @md.generated
        # was interpolating @txt (the endnote scratch text) instead of the value
        @tp[:generated_f],@tp[:generated_i]='generated, ',"'#{@md.generated}', "
      end
      SiSU_DB_DBI::Test.new(self,@opt).verify              #% import title names, filenames (tuple)
      SiSU_DB_tuple::Load_metadata.new(@conn,@tp,@@id_t,@opt,@file).tuple
    end
    # Walks the document objects in dal_array and returns the array of tuples
    # for the documents table and the three endnote tables. Each data.obj is
    # modified in place (inline markup removed, endnote markers replaced).
    # Errors are reported through SiSU_Errors and whatever was collected up to
    # that point is returned.
    def db_import_documents(dal_array)                     #% import documents - populate main database table, import into substantive database tables (tuple)
      begin
        @col[:tid]=@@id_t
        @en,@en_ast,@en_pls=[],[],[]
        @col[:en_a]=nil
        @col[:en_z]=nil
        @tuple_array=[]
        dal_array.each do |data|
          db_import_strip_inline_markup(data.obj)
          @col[:seg]=@@seg
          next unless data.of =~/para|heading|group/      # regular text what of code-blocks grouped text etc.
          notedata=data.obj.dup                            # keeps the endnote markup that clean_text removes below
          segment_level=[4,5,6].find { |level| data.ln==level }
          if data.is=='heading' \
          and data.ln.inspect=~/[123]/
            db_import_heading_upper(data)
          elsif data.is=='heading' \
          and segment_level
            db_import_heading_segment(data,segment_level)
          else                                             #% regular text
            db_import_text_object(data)
          end
          #% import into database endnotes tables
          db_import_endnotes(notedata,data,'endnotes',Mx[:en_a_o],Mx[:en_a_c])
          db_import_endnotes(notedata,data,'endnotes_asterisk',Mx[:en_b_o],Mx[:en_b_c],'\*')
          db_import_endnotes(notedata,data,'endnotes_plus',Mx[:en_b_o],Mx[:en_b_c],'\+')
        end
      rescue
        SiSU_Errors::Info_error.new($!,$@,@opt.cmd,@opt.fns).error
      end
      @tuple_array
    end
    # Remembers txt as the text the endnote helpers below (standard, asterisk,
    # plus, clean_text, range) work on and returns self so that they can be
    # chained: endnotes(txt).clean_text(url)
    def endnotes(txt)
      @txt=txt
      self
    end
    # numbers of the regular endnotes in @txt as [["1"],["2"],...], nil if none
    def standard
      @txt.scan(/#{Mx[:en_a_o]}(\d+).+?#{Mx[:en_a_c]}/) if @txt =~/#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}/
    end
    # as standard, for endnotes marked with an asterisk
    def asterisk
      @txt.scan(/#{Mx[:en_b_o]}[*](\d+).+?#{Mx[:en_b_c]}/) if @txt =~/#{Mx[:en_b_o]}\*.+?#{Mx[:en_b_c]}/
    end
    # as standard, for endnotes marked with a plus
    def plus
      @txt.scan(/#{Mx[:en_b_o]}[+](\d+).+?#{Mx[:en_b_c]}/) if @txt =~/#{Mx[:en_b_o]}\+.+?#{Mx[:en_b_c]}/
    end
    # Replaces every endnote in @txt (in place) by its superscripted number,
    # linked to base_url when one is given. Returns @txt.
    def clean_text(base_url=nil)
      if base_url
        link=%{<sup><a href="#{base_url}#_\\1" name="-\\1">\\1</a></sup>}
        @txt.gsub!(/#{Mx[:en_a_o]}(\d+).+?#{Mx[:en_a_c]}/,link)
        @txt.gsub!(/#{Mx[:en_b_o]}([*]\d+).+?#{Mx[:en_b_c]}/,link)
        @txt.gsub!(/#{Mx[:en_b_o]}([+]\d+).+?#{Mx[:en_b_c]}/,link)
      else
        @txt.gsub!(/#{Mx[:en_a_o]}(\d+).+?#{Mx[:en_a_c]}/,'<sup>\1</sup>')
        @txt.gsub!(/#{Mx[:en_b_o]}([*]\d+).+?#{Mx[:en_b_c]}/,'<sup>\1</sup>')
        @txt.gsub!(/#{Mx[:en_b_o]}([+]\d+).+?#{Mx[:en_b_c]}/,'<sup>\1</sup>')
      end
      @txt
    end
    # Stores the first and last endnote number found in @txt in @col[:en_a]
    # and @col[:en_z] (nil when there are none) and returns @col.
    # The original looked for "number, whitespace, text" inside whitespace
    # separated words and for a doubled closing marker, so it never matched;
    # the whole text is scanned instead.
    def range
      @col[:en_a]=@col[:en_z]=nil
      numbers=@txt.scan(/(?:#{Mx[:en_a_o]}|#{Mx[:en_b_o]})[*+]?(\d+)\s+.+?(?:#{Mx[:en_a_c]}|#{Mx[:en_b_c]})/).flatten
      @col[:en_a],@col[:en_z]=numbers.first,numbers.last unless numbers.empty?
      @col
    end
    # Returns the tuple (from SiSU_DB_tuple::Load_urls) listing the url of
    # each output format of the document. When opt.cmd contains 'e' only
    # outputs that exist on disk are listed. dbi_unit is not used; content
    # replaces @fnc.
    def db_import_urls(dbi_unit,content)                   #% import documents OID - populate database
      tuple=nil
      begin
        @fnc=content
        @env=SiSU_Env::Info_env.new(@opt.fns)
        base=@env.url.root
        out=@env.path.output
        f,u={},{}
        if @fnb.nil? \
        or @fnb.empty?
          p 'file output path error' #remove
        end
        outputs=[
          [:txt,        'plaintext',  @md.fn[:plain]],
          [:html_toc,   'html_toc',   @md.fn[:toc]],
          [:html_doc,   'html_doc',   @md.fn[:doc]],
          [:xhtml,      'xhtml',      @md.fn[:xhtml]],
          [:xml_sax,    'xml_sax',    @md.fn[:sax]],
          [:xml_dom,    'xml_dom',    @md.fn[:dom]],
          [:odf,        'odf',        @md.fn[:odf]],
          [:pdf_p,      'pdf_p',      @md.fn[:pdf_p]],
          [:pdf_l,      'pdf_l',      @md.fn[:pdf_l]],
          [:concordance,'concordance',@md.fn[:concordance]],
          [:latex_p,    'latex_p',    "#{@opt.fns}.tex"],
          [:latex_l,    'latex_l',    "#{@opt.fns}.landscape.tex"], # url was built from "#{@opt}.fns}"
          [:digest,     'digest',     @md.fn[:digest]],
          [:manifest,   'manifest',   @md.fn[:manifest]],   #revisit, was to be text, this is html
          [:markup,     'markup',     "#{@opt.fns}.meta"],
          [:sisupod,    'sisupod',    "#{@opt.fns}.tgz"]
        ]
        outputs.each do |key,column,filename|
          next unless @opt.cmd !~/e/ \
          or FileTest.file?("#{out}/#{@fnb}/#{filename}")
          f[key],u[key]="#{column},","'#{base}/#{@fnb}/#{filename}',"
        end
        tuple=SiSU_DB_tuple::Load_urls.new(@conn,f,u,@@id_t,@opt,@file).tuple
      rescue
        SiSU_Errors::Info_error.new($!,$@,@opt.cmd,@opt.fns).error
      end
      tuple
    end
    private
    # Runs a "SELECT MAX(...)" statement and returns the result as an Integer
    # (0 for an empty table), or nil when the query fails.
    def db_import_max_id(sql)
      if @driver_sqlite3
        @conn.execute(sql).join.to_i
      else
        # the rows are flattened before conversion: Array#to_s is #inspect on
        # ruby >= 1.9, so "fetch_all.to_s.to_i" always produced 0
        @conn.execute(sql) { |sth| sth.fetch_all.map { |row| row.to_a }.flatten.first.to_i }
      end
    rescue
      puts "#{__FILE__}:#{__LINE__}" if @opt.cmd =~/M/
      nil
    end
    # best effort: leave the postgresql connection usable after a failed import
    def db_import_rollback
      @conn.do("ROLLBACK")
    rescue
      nil
    end
    # Reports a failed transaction and writes the statements that were to be
    # executed to <sql path>/<fnb>.sql for inspection.
    def db_import_report_failure(error,statements)
      if defined?(DBI::DatabaseError) \
      and error.is_a?(DBI::DatabaseError)
        puts "Error code: #{error.err}"
        puts "Error message: #{error.errstr}"
        puts "Error SQLSTATE: #{error.state}"
      end
      SiSU_Errors::Info_error.new(error,error.backtrace,@opt.cmd,@opt.fns).error
      sqlfn="#{@env.path.sql}/#{@md.fnb}.sql"
      File.open(sqlfn,'w') do |file|                       # block form closes the file
        statements.each { |statement| file.puts statement }
      end
      p sqlfn
      if @opt.cmd =~/M/
        p @conn.methods.sort
        puts "#{__FILE__}:#{__LINE__}"
      end
    end
    # Value of the block if it is a String containing a non blank character,
    # otherwise nil; also nil when part of the accessor chain is missing
    # (what the original tested with "defined?").
    def db_import_md_text
      value=yield
      (value.is_a?(String) and value =~/\S/) ? value : nil
    rescue NoMethodError
      nil
    end
    # Adds the field/value pair for one metadata column to @tp when the block
    # yields non blank text.
    def db_import_metadata_field(key,column=key.to_s,&value)
      txt=db_import_md_text(&value)
      return unless txt
      txt=special_character_escape(txt.dup)
      @tp["#{key}_f".to_sym],@tp["#{key}_i".to_sym]="#{column}, ","'#{txt}', "
    end
    # removes font face, glyph and tag markers from a document object (in place)
    def db_import_strip_inline_markup(obj)
      [:bold,:italics,:underscore,:superscript,:subscript,:insert,:cite,:strike].each do |face|
        obj.gsub!(/#{Mx["fa_#{face}_o".to_sym]}(.+?)#{Mx["fa_#{face}_c".to_sym]}/,'\1')
      end
      obj.gsub!(/#{Mx[:gl_o]}(●)#{Mx[:gl_c]}\s*/,'\1 ')
      obj.gsub!(/#{Mx[:tag_o]}\S+?#{Mx[:tag_c]}/,'')       #check
    end
    # columns every heading shares
    def db_import_heading_columns(data)
      @col[:ocn],@col[:lev_an],@col[:ocnd],@col[:ocns]=data.ocn,data.lv,data.odv,data.osp
      @col[:t_of],@col[:t_is],@col[:node],@col[:parent]=data.of,data.is,data.node,data.parent
      @col[:digest_clean],@col[:digest_all]='',''
    end
    # sets @hname, @env and @base_url for the segment the current object is in
    def db_import_segment_url
      @hname=if @col[:seg] \
      and not @col[:seg].to_s.empty?
        @@hname=@col[:seg].to_s
      else @@hname
      end
      @env=SiSU_Env::Info_env.new(@md.fns)
      @base_url="#{@env.url.root}/#{@md.fnb}/#{@hname}.html"
    end
    # Records the endnote numbers found in txt and replaces the endnotes
    # within txt (in place) by superscript references.
    def db_import_endnote_refs(txt,base_url=nil)
      return unless txt =~/(?:#{Mx[:en_a_o]}|#{Mx[:en_b_o]})[*+]?(\d+)\s+.+?(?:#{Mx[:en_a_c]}|#{Mx[:en_b_c]})/
      endnotes(txt)
      range
      @en << standard if txt =~/#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}/
      @en_ast << asterisk if txt =~/#{Mx[:en_b_o]}\*.+?#{Mx[:en_b_c]}/
      @en_pls << plus if txt =~/#{Mx[:en_b_o]}\+.+?#{Mx[:en_b_c]}/ # level 1-3 closed with en_b_o
      clean_text(base_url)
    end
    # escapes body, derives the plain text from it and queues the documents tuple
    def db_import_document_tuple(body)
      @col[:body]=body
      special_character_escape(@col[:body])
      @col[:plaintext]=@col[:body].dup
      strip_markup(@col[:plaintext])
      @en_a,@en_z=@en[0].first,@en[0].last if @en[0]
      @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0]
      @en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0]
      @tuple_array << SiSU_DB_tuple::Load_documents.new(@conn,@col,@opt,@file).tuple
    end
    # clears the per object columns once the tuple has been queued
    def db_import_reset_object
      @col[:en_a]=@col[:en_z]=nil                          # range now sets these, do not carry them over
      @col[:lev]=@col[:plaintext]=@col[:body]=''
    end
    # heading levels 1 - 3
    def db_import_heading_upper(data)
      db_import_heading_columns(data)
      @col[:lev]=data.ln
      @col[:lid]+=1
      db_import_endnote_refs(data.obj)
      db_import_document_tuple(SiSU_Format_Shared::CSS_Format.new(@md,data).lev4_minus)
      # compared as text: a Regexp never matches an Integer level, which left
      # the lv1 - lv3 counters at 0
      case @col[:lev].to_s
      when /1/; @col[:lv1]+=1
      when /2/; @col[:lv2]+=1
      when /3/; @col[:lv3]+=1
      end
      db_import_reset_object
    end
    # heading levels 4 - 6: level 4 starts a new segment, 5 and 6 refine it
    def db_import_heading_segment(data,level)
      db_import_heading_columns(data)
      if level==4
        @@seg=data.name
      else
        @@seg_full=data.name if data.name                  #check data.name
        @@seg ||=''                                        # watch
      end
      @col[:seg]=@@seg
      @col["lv#{level}".to_sym]+=1
      @col[:lid]+=1
      @col[:lev]=level
      db_import_segment_url
      db_import_endnote_refs(data.obj,@base_url)
      db_import_document_tuple(SiSU_Format_Shared::CSS_Format.new(@md,data).lev4_plus)
      db_import_reset_object
    end
    # regular text objects (paragraphs, tables, grouped text)
    def db_import_text_object(data)
      @col[:lid]+=1
      txt=data.obj
      @col[:ocn],@col[:ocnd],@col[:ocns]=data.ocn,data.odv,data.osp
      @col[:t_of],@col[:t_is],@col[:node],@col[:parent]=data.of,data.is,'',data.parent
      @col[:digest_clean],@col[:digest_all]='',''
      db_import_segment_url
      db_import_endnote_refs(txt,@base_url)
      if @sql_type=~/pg/ \
      and txt.size > (document_clean - 1)                  #% examine pg build & remove limitation
        puts "\n\nTOO LARGE (TXT - see error log)\n\n"
        open("#{Dir.pwd}/pg_documents_error_log",'a') do |error|
          # logs the text itself, @col[:body] is still empty at this point
          error.puts("\n#{@opt.fns}\nTEXT BODY\n#{txt.size} object #{@col[:ocn]} -> #{txt.slice(0..500)}")
        end
        # NOTE(review): the original assigned a "LARGE TEXT BLOCK OMITTED"
        # placeholder to a local that was never read, so the full text has
        # always been stored; that is unchanged - confirm whether the
        # omission should really be applied.
      end
      body=if data.is=='table'
        SiSU_Format_Shared::CSS_Format.new(@md,data).html_table
      elsif defined? data.indent \
      and data.indent.to_s =~/[1-9]/
        SiSU_Format_Shared::CSS_Format.new(@md,data).indent(data.indent)
      else
        SiSU_Format_Shared::CSS_Format.new(@md,data).norm
      end
      db_import_document_tuple(body)
      @en,@en_ast,@en_pls=[],[],[]
      db_import_reset_object
    end
    # Queues one tuple per endnote of the given kind found in notedata.
    # type:: target table ('endnotes', 'endnotes_asterisk', 'endnotes_plus')
    # open, close:: regexp source of the endnote delimiters
    # mark:: regexp source of the marker preceding the number ('' , '\*', '\+')
    def db_import_endnotes(notedata,data,type,open,close,mark='')
      notedata.scan(/#{open}#{mark}.+?#{close}/).each do |inf|
        next unless inf =~/#{open}#{mark}(\d+)(.+?)#{close}/
        nr,txt,digest_clean=$1,$2.strip,0
        @id_n+=1
        body=SiSU_Format_Shared::CSS_Format.new(@md,data).endnote(nr,txt)
        special_character_escape(body)                     # was skipped for asterisk and plus notes
        special_character_escape(txt)
        strip_markup(txt)
        if txt.size > (endnote_clean - 1)
          puts "\n\nTOO LARGE (ENDNOTE - see error log)\n\n"
          open("#{Dir.pwd}/pg_documents_error_log",'a') do |error|
            error.puts("\n#{@opt.fns}\nENDNOTE\n#{txt.size} object #{@col[:ocn]},#{@col[:ocnd]},#{@col[:ocns]} -> #{txt.slice(0..500)}")
          end
          txt=%{\n\nLARGE TEXT BLOCK OMITTED\n\n}
        end
        en={ :type => type,
          :id => @id_n,
          :lid => @col[:lid],
          :nr => nr,
          :txt => txt,
          :body => body,
          :ocn => @col[:ocn],
          :ocnd => @col[:ocnd],
          :ocns => @col[:ocns],
          :id_t => @@id_t,
          :hash => digest_clean
        }
        @tuple_array << SiSU_DB_tuple::Load_endnotes.new(@conn,en,@opt,@file).tuple
      end
    end
  end
end
__END__