about | summary | refs | log | tree | commit | diff | homepage
diff options
context:
space:
mode:
author Ralph Amissah <ralph@amissah.com> 2008-09-03 22:25:03 -0400
committer Ralph Amissah <ralph@amissah.com> 2008-09-03 22:36:10 -0400
commit 2a738f528cb87793ff7f8312099666af1e21f44c (patch)
tree e3d58f4118689a0d353dc04775652c83fc40b65e
parent html segments; sql db field lengths increased (diff)
xml character encoding adjusted; xml image match; odf issue with '@' symbol in url
-rw-r--r-- lib/sisu/v0/character_encoding.rb 4
-rw-r--r-- lib/sisu/v0/odf.rb 7
-rw-r--r-- lib/sisu/v0/param.rb 42
-rw-r--r-- lib/sisu/v0/shared_xml.rb 29
-rw-r--r-- lib/sisu/v0/xhtml.rb 2
-rw-r--r-- lib/sisu/v0/xml.rb 2
-rw-r--r-- lib/sisu/v0/xml_dom.rb 2
7 files changed, 61 insertions, 27 deletions
diff --git a/lib/sisu/v0/character_encoding.rb b/lib/sisu/v0/character_encoding.rb
index 60c2f335..aa856cdd 100644
--- a/lib/sisu/v0/character_encoding.rb
+++ b/lib/sisu/v0/character_encoding.rb
@@ -374,7 +374,9 @@ module SiSU_character_encode
['ü', 252, '374', '\303\274', '&#252;', '&uuml;', 'ü', '\"{u}', '', 'Small u, umlaut ü' ],
['ý', 253, '375', '\303\275', '&#253;', '&yacute;', 'ý', '', '', 'Small y, acute accent ý' ],
['þ', 254, '376', '\303\276', '&#254;', '&thorn;', 'þ', '', '', 'Small thorn, Icelandic þ' ],
- ['ÿ', 255, '377', '\303\277', '&#255;', '&yuml;', 'ÿ', '', '', 'Smally y, umlaut ÿ' ]
+ ['ÿ', 255, '377', '\303\277', '&#255;', '&yuml;', 'ÿ', '', '', 'Smally y, umlaut ÿ' ],
+ ['∝', , '', '', '&#8733;', '&prop;', '∝', '', '', 'proportional to U+221D (8733) ∝' ],
+ ['∞', , '', '', '&#8734;', '&infin;', '∞', '', '', 'infinity U+221E (8734) ∞' ],
]
end
end
diff --git a/lib/sisu/v0/odf.rb b/lib/sisu/v0/odf.rb
index 6b1491c2..bf23f91f 100644
--- a/lib/sisu/v0/odf.rb
+++ b/lib/sisu/v0/odf.rb
@@ -267,7 +267,7 @@ module SiSU_ODF
#para.gsub!(/\b((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([;.,]?(?:\s|$))/, also works
#%{#{@url_brace.xml_open}<text:a xlink:type="simple" xlink:href="\\1">\\1</text:a>#{@url_brace.xml_close}\\2}) #http ftp matches with decoration
para.gsub!(/([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+)/,
- %{#{@url_brace.xml_open}<text:a xlink:type="simple" xlink:href="mailto:\\1">\\1</text:a>#{@url_brace.xml_close}})
+ %{#{@url_brace.xml_open}<text:a xlink:type="simple" xlink:href="mailto:\\1">\\1</text:a>#{@url_brace.xml_close}}) if para !~/http:\/\// # improve upon, document crash where url contains '@' symbol
para=case para
when /^#{Mx[:pa_o]}:i([1-9])#{Mx[:pa_c]}/m
m=$1
@@ -443,6 +443,7 @@ module SiSU_ODF
#para.gsub!(/&lt;(~\d+;(?:\w|[0-6]:)\d+;\w\d+)&gt;&lt;(#@dp:#@dp)&gt;/,'<\1><\2>')
para='' if para =~/#{Mx[:lv_o]}\d+:.*?#{Mx[:lv_c]}.+?#{Mx[:pa_non_object_dummy_heading]}/
para_array=[]
+ para.gsub!(/</,'&lt;'); para.gsub!(/>/,'&gt;')
word=para.scan(/\S+|\n/)
if word
word.each do |w| # _ - / # | : ! ^ ~
@@ -487,8 +488,10 @@ module SiSU_ODF
para.gsub!(/#{Mx[:fa_underscore_o]}(.+?)#{Mx[:fa_underscore_c]}/,'<text:span text:style-name="T3">\1</text:span>')
para.gsub!(/#{Mx[:fa_superscript_o]}(.+?)#{Mx[:fa_superscript_c]}/,'<text:span text:style-name="T4">\1</text:span>')
para.gsub!(/#{Mx[:fa_subscript_o]}(.+?)#{Mx[:fa_subscript_c]}/,'<text:span text:style-name="T5">\1</text:span>')
- para.gsub!(/`/,"'")
+ para.gsub!(/[`’]/,"'")
para.gsub!(/­/u,'-')
+ para.gsub!(/ /u, ' ') # space identify
+ para.gsub!(/ /u, ' ') # space identify
para.gsub!(/·/u,'*')
para.gsub!(/[“”]/u,'""')
para.gsub!(/[­–—]/u,'-') #— – chk
diff --git a/lib/sisu/v0/param.rb b/lib/sisu/v0/param.rb
index 3cfbf1e5..b211f5c1 100644
--- a/lib/sisu/v0/param.rb
+++ b/lib/sisu/v0/param.rb
@@ -368,32 +368,32 @@ module SiSU_Param
@dc_date_modified=date
@date_modified_scheme='scheme="ISO-8601"' if date =~/\d{4}-\d{2}-\d{2}/
end
- when /^(?:0~type|@type:)\s+(.+?)$/m; @dc_type=$1 #% metainfo DC
- when /^(?:0~format|@format:)\s+(.+?)$/m; @dc_format=$1 #% metainfo DC
- #when /^(?:0~identifier|@identifier:)\s+(.+?)$/m; @dc_identifier=$1 #% metainfo DC
- when /^(?:0~source|@source:)\s+(.+?)$/m; @dc_source=$1 #% metainfo DC
- when /^(?:0~language(?:\.document)?|@language(?:\.document)?:)\s+(.+?)$/m #% metainfo DC
+ when /^(?:0~type|@type:)\s+(.+?)$/m; @dc_type=$1 #% metainfo DC
+ when /^(?:0~format|@format:)\s+(.+?)$/m; @dc_format=$1 #% metainfo DC
+ #when /^(?:0~identifier|@identifier:)\s+(.+?)$/m; @dc_identifier=$1 #% metainfo DC
+ when /^(?:0~source|@source:)\s+(.+?)$/m; @dc_source=$1 #% metainfo DC
+ when /^(?:0~language(?:\.document)?|@language(?:\.document)?:)\s+(.+?)$/m #% metainfo DC
x=$1.strip
lang=SiSU_Env::Standardise_language.new(x.dup)
@dc_language[:code]=lang.code
@dc_language[:name]=lang.title
- when /^(?:0~language\.original|@language\.original:)\s+(.+?)$/m #% metainfo DC
+ when /^(?:0~language\.original|@language\.original:)\s+(.+?)$/m #% metainfo DC
x=$1.strip
lang=SiSU_Env::Standardise_language.new(x.dup)
@language_original[:name]=lang.title
- when /^(?:0~relation|@relation:)\s+(.+?)$/m; @dc_relation=$1 #% metainfo DC
- when /^(?:0~coverage|@coverage:)\s+(.+?)$/m; @dc_coverage=$1 #% metainfo DC
- when /^(?:0~rights|@rights:)\s+(.+?)$/m; @dc_rights=$1 #% metainfo DC copyright, public domain, copyleft, creative commons, etc.
- when /^(?:0~papersize|@papersize:)\s+(.+?)$/m #% metainfo DC
+ when /^(?:0~relation|@relation:)\s+(.+?)$/m; @dc_relation=$1 #% metainfo DC
+ when /^(?:0~coverage|@coverage:)\s+(.+?)$/m; @dc_coverage=$1 #% metainfo DC
+ when /^(?:0~rights|@rights:)\s+(.+?)$/m; @dc_rights=$1.gsub(/<(?:\/\s*)?br(?:\s*\/)?>/,Mx[:br_line]) #% metainfo DC copyright, public domain, copyleft, creative commons, etc.
+ when /^(?:0~papersize|@papersize:)\s+(.+?)$/m #% metainfo DC
l=$1
if @mod.inspect !~/--papersize[=-]\S+/
l=determine_papersize(l.dup)
@papersize=l
end
- when /^(?:0~keywords?|@keywords?:?)\s+(.+?)$/m; @keywords=$1 #% metainfo DC
- when /^(?:0~comments?|@comments?:?)\s+(.+?)$/m; @comments=$1 #% metainfo DC
- when /^(?:0~abstract|@abstract)\s+(.+?)$/m; @abstract=$1 #% metainfo DC
- when /^(?:0~tags?|@tags?:)\s+\S/m #% metainfo
+ when /^(?:0~keywords?|@keywords?:?)\s+(.+?)$/m; @keywords=$1 #% metainfo DC
+ when /^(?:0~comments?|@comments?:?)\s+(.+?)$/m; @comments=$1.gsub(/<(?:\/\s*)?br(?:\s*\/)?>/,Mx[:br_line]) #% metainfo DC
+ when /^(?:0~abstract|@abstract)\s+(.+?)$/m; @abstract=$1.gsub(/<(?:\/\s*)?br(?:\s*\/)?>/,Mx[:br_line]) #% metainfo DC
+ when /^(?:0~tags?|@tags?:)\s+\S/m #% metainfo
tags=para.match(/^(?:0~tags?|@tags?:)\s+(.+)\Z/m)[1]
tags.split(/,|$/).each do |tag|
tag.strip!
@@ -403,18 +403,18 @@ module SiSU_Param
tag_a=tag_a.split(/:/).join('][')
@tag_a << tag_a
end
- when /^(?:0~catalogue|@catalogue:)\s+(.+)?$/m #% metainfo
+ when /^(?:0~catalogue|@catalogue:)\s+(.+)?$/m #% metainfo
m=$1
@cls_pg=m.match(/pg=(\S+)/)[1] if m =~/pg=/
@cls_isbn=m.match(/isbn=(\S+)/)[1] if m =~/isbn=/
@cls_dewey=m.match(/dewey=(\S+)/)[1] if m =~/dewey=/
@cls_loc=m.match(/loc=(\S+)/)[1] if m =~/loc=/
- when /^(?:0~class(?:ify)?_loc|@class(?:ify)?_loc:)\s+(.+?)$/m; @cls_loc=$1 #% metainfo
- when /^(?:0~class(?:ify)?_dewey|@class(?:ify)?_dewey:)\s+(.+?)$/m; @cls_dewey=$1 #% metainfo
- when /^(?:0~class(?:ify)?_pg|@class(?:ify)?_pg)\s+(.+?)$/m; @cls_pg=$1 #% metainfo
- when /^(?:0~(?:class(?:ify)?_)?isbn|@(?:class(?:ify)?_)?isbn)\s+(\S+?)$/m; @cls_isbn=$1 #% metainfo
- when /^(?:0~images?|@images?:)\s+(.+?)$/m; @image=$1 #% processing
- when /^(?:0~(?:toc|structure)|@(?:toc|structure):)\s+(.+?)\Z/m #% processing
+ when /^(?:0~class(?:ify)?_loc|@class(?:ify)?_loc:)\s+(.+?)$/m; @cls_loc=$1 #% metainfo
+ when /^(?:0~class(?:ify)?_dewey|@class(?:ify)?_dewey:)\s+(.+?)$/m; @cls_dewey=$1 #% metainfo
+ when /^(?:0~class(?:ify)?_pg|@class(?:ify)?_pg)\s+(.+?)$/m; @cls_pg=$1 #% metainfo
+ when /^(?:0~(?:class(?:ify)?_)?isbn|@(?:class(?:ify)?_)?isbn)\s+(\S+?)$/m; @cls_isbn=$1 #% metainfo
+ when /^(?:0~images?|@images?:)\s+(.+?)$/m; @image=$1 #% processing
+ when /^(?:0~(?:toc|structure)|@(?:toc|structure):)\s+(.+?)\Z/m #% processing
doc_toc_str=$1
@toc=doc_toc_str.split(/;\s*/)
@toc=[ @toc ] if @toc == String
diff --git a/lib/sisu/v0/shared_xml.rb b/lib/sisu/v0/shared_xml.rb
index 9203f0df..228a5c14 100644
--- a/lib/sisu/v0/shared_xml.rb
+++ b/lib/sisu/v0/shared_xml.rb
@@ -166,6 +166,8 @@ module SiSU_XML_munge
#¢£¥§©ª«®°±²³µ¶¹º»¼½¾×÷
##para.gsub!(//, '&#;')
##para.gsub!(//, '&;')
+ para.gsub!(/</u, '&#60;') # '&lt;' # &#060;
+ para.gsub!(/>/u, '&#62;') # '&gt;' # &#062;
para.gsub!(/¢/u, '&#162;') # '&cent;' # &#162;
para.gsub!(/£/u, '&#163;') # '&pound;' # &#163;
para.gsub!(/¥/u, '&#165;') # '&yen;' # &#165;
@@ -250,10 +252,23 @@ module SiSU_XML_munge
para.gsub!(/ü/u, '&#253;') # '&uuml;' # &#253;
para.gsub!(/þ/u, '&#254;') # '&thorn;' # &#254;
para.gsub!(/ÿ/u, '&#255;') # '&yuml;' # &#255;
+ para.gsub!(/‘/u, '&#8216;') # '&lsquo;' # &#8216;
+ para.gsub!(/’/u, '&#8217;') # '&rsquo;' # &#8217;
+ para.gsub!(/–/u, '&#8211;') # &ndash; # &#8211;
+ para.gsub!(/—/u, '&#8212;') # &mdash; # &#8212;
+ para.gsub!(/∝/u, '&#8733;') # &prop; # &#8733;
+ para.gsub!(/∞/u, '&#8734;') # &infin; # &#8734;
+ para.gsub!(/™/u, '&#8482;') # &trade; # &#8482;
+ para.gsub!(/✠/u, '&#8224;') # &dagger; # &#8224; incorrect replacement! †
+ para.gsub!(/ /u, ' ') # space identify
+ para.gsub!(/ /u, ' ') # space identify
end
end
def html(para='')
if @sys.locale =~/utf-?8/i # instead ucs for utf8 #require 'iconv' ? Iñtërnâtiônàlizætiøn
+ para.gsub!(/ /u, ' ') # space identify
+ para.gsub!(/ /u, ' ') # space identify
+ else
para.gsub!(/¢/u, '&cent;') # &#162;
para.gsub!(/£/u, '&pound;') # &#163;
para.gsub!(/¥/u, '&yen;') # &#165;
@@ -338,6 +353,16 @@ module SiSU_XML_munge
para.gsub!(/ü/u, '&uuml;') # &#253;
para.gsub!(/þ/u, '&thorn;') # &#254;
para.gsub!(/ÿ/u, '&yuml;') # &#255;
+ para.gsub!(/‘/u, '&#lsquo;') # &lsquo; # &#8216;
+ para.gsub!(/’/u, '&#rsquo;') # &rsquo; # &#8217;
+ para.gsub!(/–/u, '&ndash;') # &ndash; # &#8211;
+ para.gsub!(/—/u, '&mdash;') # &mdash; # &#8212;
+ para.gsub!(/∝/u, '&prop;') # &prop; # &#8733;
+ para.gsub!(/∞/u, '&infin;') # &infin; # &#8734;
+ para.gsub!(/™/u, '&trade;') # &trade; # &#8482;
+ para.gsub!(/✠/u, '&dagger;') # &dagger; # &#8224; incorrect replacement †
+ para.gsub!(/ /u, ' ') # space identify
+ para.gsub!(/ /u, ' ') # space identify
end
end
self
@@ -381,6 +406,10 @@ module SiSU_XML_munge
%{<image xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:actuate="onLoad" xlink:show="embed" xlink:href="#{@dir.url.images_local}/\\1" width="\\2" height="\\3" />[\\1] \\4})
para.gsub!(/(?:^|[^_\\])#{Mx[:lnk_o]}\s*(\S+?\.(?:jpg|png|gif))(\s+[^}]+)?#{Mx[:lnk_c]}(https?:\/\/\S+)/,
%{<image xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:actuate="onLoad" xlink:show="embed" xlink:href="#{@dir.url.images_local}/\\1"/>\\1})
+ para.gsub!(/(?:^|[^_\\])#{Mx[:lnk_o]}\s*(\S+?\.(?:jpg|png|gif))\s+(\d+)x(\d+)(\s+[^}]+)?#{Mx[:lnk_c]}image/,
+ %{<image xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:actuate="onLoad" xlink:show="embed" xlink:href="#{@dir.url.images_local}/\\1" width="\\2" height="\\3" />[\\1] \\4})
+ para.gsub!(/(?:^|[^_\\])#{Mx[:lnk_o]}\s*(\S+?\.(?:jpg|png|gif))(\s+[^}]+)?#{Mx[:lnk_c]}image/,
+ %{<image xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:actuate="onLoad" xlink:show="embed" xlink:href="#{@dir.url.images_local}/\\1"/>\\1})
para.gsub!(/(^|#{Mx[:gl_c]}|\s)#{Mx[:lnk_o]}(.+?)#{Mx[:lnk_c]}(https?:\/\/[^"><]+?)([,.:;"><]?(?=\s|$))/,
'\1<link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="\3">\2</link>\4') #watch, compare html_tune
para.gsub!(/(^|#{Mx[:gl_c]}|\s)((?:https?|file|ftp):\/\/\S+?\.[^'"><\s]+?)([;.,]?(?=\s|$))/,
diff --git a/lib/sisu/v0/xhtml.rb b/lib/sisu/v0/xhtml.rb
index 95da5d2d..5f99cacf 100644
--- a/lib/sisu/v0/xhtml.rb
+++ b/lib/sisu/v0/xhtml.rb
@@ -267,8 +267,8 @@ WOK
(0..6).each { |x| @cont[x]=@level[x]=false }
(4..6).each { |x| @xml_contents_close[x]='' }
data.each do |para|
- para=@trans.markup(para)
@trans.char_enc.utf8(para) if @sys.locale =~/utf-?8/i #% utf8
+ para=@trans.markup(para)
if para =~/^#{Rx[:meta]}\s*.+?$/ # for headers
d_meta=SiSU_text_utils::Header_scan.new(@md,para).meta
if d_meta; xml_head(d_meta)
diff --git a/lib/sisu/v0/xml.rb b/lib/sisu/v0/xml.rb
index 4826a503..49c71b88 100644
--- a/lib/sisu/v0/xml.rb
+++ b/lib/sisu/v0/xml.rb
@@ -291,8 +291,8 @@ WOK
(0..6).each { |x| @cont[x]=@level[x]=false }
(4..6).each { |x| @xml_contents_close[x]='' }
data.each do |para|
- para=@trans.markup(para)
@trans.char_enc.utf8(para) if @sys.locale =~/utf-?8/i #% utf8
+ para=@trans.markup(para)
if para =~/^#{Rx[:meta]}\s*.+?$/ # for headers
d_meta=SiSU_text_utils::Header_scan.new(@md,para).meta
if d_meta; xml_head(d_meta)
diff --git a/lib/sisu/v0/xml_dom.rb b/lib/sisu/v0/xml_dom.rb
index b2bc0de7..13aed504 100644
--- a/lib/sisu/v0/xml_dom.rb
+++ b/lib/sisu/v0/xml_dom.rb
@@ -349,8 +349,8 @@ WOK
(0..6).each { |x| @cont[x]=@level[x]=false }
(4..6).each { |x| @xml_contents_close[x]='' }
data.each do |para|
- para=@trans.markup(para)
@trans.char_enc.utf8(para) if @sys.locale =~/utf-?8/i #% utf8
+ para=@trans.markup(para)
if para =~/^#{Rx[:meta]}\s*(.+?)$/ # for headers
d_meta=SiSU_text_utils::Header_scan.new(@md,para).meta
if d_meta; xml_head(d_meta)