From e973365c4b74be2b2cff9be970ccba5928dbe368 Mon Sep 17 00:00:00 2001 From: Ralph Amissah Date: Wed, 22 May 2019 10:50:33 -0400 Subject: 0.7.3 start to look at document harvest (initial stub) --- org/default_regex.org | 13 ++- org/doc_reform.org | 227 ++++++++++++++++++++++++++++++++++++-------- org/dr_build_scaffold.org | 1 - org/meta_conf_make_meta.org | 218 +++++++++++++++++++++++++----------------- 4 files changed, 329 insertions(+), 130 deletions(-) (limited to 'org') diff --git a/org/default_regex.org b/org/default_regex.org index 2958027..6d17f0c 100644 --- a/org/default_regex.org +++ b/org/default_regex.org @@ -51,6 +51,7 @@ static template DocReformRgxInit() { #+BEGIN_SRC d /+ misc +/ static true_dollar = ctRegex!(`\$`, "gm"); +static sep = ctRegex!(`␣`, "gm"); static flag_action = ctRegex!(`^(--[a-z][a-z0-9-]+)$`); static flag_action_str = ctRegex!(` (--[a-z][a-z0-9-]+)`); static within_quotes = ctRegex!(`"(.+?)"`, "m"); @@ -106,7 +107,7 @@ static make_simple_substitutions_d = ctRegex!(`(?P\S.+?),\s+(?P.+)`,"i"); static toml_header_meta_title = ctRegex!(`^\s*(title\s*=\s*"|\[title\])`, "m"); #+END_SRC @@ -368,6 +369,16 @@ static bi_sub_terms_plus_object_number_offset_split = ctRegex!(`\s*\|\s*`); static bi_term_and_object_numbers_match = ctRegex!(`^(.+?)\+(\d+)`); #+END_SRC +** topic register split (document classify) + +#+name: meta_rgx +#+BEGIN_SRC d +static topic_register_main_terms_split = ctRegex!(`\s*;\s*`); +static topic_register_main_term_plus_rest_split = ctRegex!(`\s*:\s*`); +static topic_register_sub_terms_split = ctRegex!(`\s*\|\s*`); +static topic_register_multiple_sub_terms_split = ctRegex!(`␣([^|␣]+(?:\|[^|␣]+)+)`); +#+END_SRC + ** language codes :language:codes: #+name: meta_rgx diff --git a/org/doc_reform.org b/org/doc_reform.org index 12eb46a..7bb8029 100644 --- a/org/doc_reform.org +++ b/org/doc_reform.org @@ -27,7 +27,7 @@ struct Version { int minor; int patch; } -enum _ver = Version(0, 7, 2); +enum _ver = Version(0, 7, 3); #+END_SRC ** compilation restrictions (supported compilers) @@ -71,8 +71,7 @@ version (Posix) { module doc_reform.sisu_document_parser; import doc_reform.conf.compile_time_info, - doc_reform.meta.metadoc, - doc_reform.meta.metadochead; + doc_reform.meta.metadoc; <> import std.algorithm; import std.parallelism; @@ -86,6 +85,7 @@ homepage "http://sisudoc.org" +/ void main(string[] args) { <> + <> <> <> if (_manifests.length > 1 // _manifests[0] initialized dummy element @@ -117,6 +117,38 @@ void main(string[] args) { } } } + if (_opt_action.verbose + && harvests.length > 1 + ) { + auto min_repeat_number = 42; + foreach(doc_harvest; harvests) { + auto char_repeat_number = (doc_harvest.title.length + + doc_harvest.author.length + 16); + char_repeat_number = (char_repeat_number > min_repeat_number) + ? char_repeat_number + : min_repeat_number; + writefln( + "%s\n\"%s\", %s%s", + mkup.repeat_character_by_number_provided("-", char_repeat_number), + doc_harvest.title, + doc_harvest.author, + (doc_harvest.date_published.length > 0) ? " (" ~ doc_harvest.date_published ~ ")" : "", + ); + string[] _topic_arr; + foreach(topic; doc_harvest.topic_register_arr.sort) { + foreach (i, _top; topic.split(mkup.sep)) { + writeln(" ", (" ".repeat(i).join), "- ", _top); + } + } + } + string[] _author_date_title; + foreach(doc_harvest; harvests) { + _author_date_title ~= doc_harvest.author_date_title; + } + foreach(_adt; _author_date_title.sort) { + writeln(_adt); + } + } } #+END_SRC @@ -193,6 +225,7 @@ import import doc_reform.meta, doc_reform.meta.metadoc_summary, + doc_reform.meta.metadoc_harvest, doc_reform.meta.metadoc_from_src, doc_reform.meta.conf_make_meta_structs, doc_reform.meta.conf_make_meta_toml, @@ -266,7 +299,25 @@ mixin outputHub; **** init :init: -#+NAME: doc_reform_args +#+NAME: doc_reform_init +#+BEGIN_SRC d +struct Harvest { + string title = ""; + string author = ""; + string author_date_title = ""; + string date_published = ""; + string[] topic_register_arr = [""]; + string html_seg_toc = ""; + string html_scroll = ""; + string epub = ""; +} +Harvest harvested; +Harvest[] harvests; +#+END_SRC + +**** args :args: + +#+NAME: doc_reform_init #+BEGIN_SRC d string flag_action; string arg_unrecognized; @@ -276,7 +327,7 @@ static auto rgx = Rgx(); *** scope (run complete) :scope: -#+NAME: doc_reform_args +#+NAME: doc_reform_init #+BEGIN_SRC d scope(success) { writefln( @@ -311,6 +362,9 @@ bool[string] opts = [ "debug" : false, "digest" : false, "epub" : false, + "harvest" : false, + "harvest-authors" : false, + "harvest-topics" : false, "html" : false, "html-seg" : false, "html-scroll" : false, @@ -366,6 +420,9 @@ auto helpInfo = getopt(args, "debug", "--debug", &opts["debug"], "digest", "--digest hash digest for each object", &opts["digest"], "epub", "--epub process epub output", &opts["epub"], + "harvest", "--harvest extract info on authors & topics from document header metadata", &opts["harvest"], + "harvest-authors", "--harvest-authors extract info on authors from document header metadata", &opts["harvest-authors"], + "harvest-topics", "--harvest-topics extract info on topics from document header metadata", &opts["harvest-topics"], "html", "--html process html output", &opts["html"], "html-seg", "--html-seg process html output", &opts["html-seg"], "html-scroll", "--html-seg process html output", &opts["html-scroll"], @@ -449,6 +506,22 @@ struct OptActions { bool epub() { return opts["epub"]; } + bool harvest() { + bool _is = ( + opts["harvest"] + || opts["harvest-authors"] + || opts["harvest-topics"] + ) + ? true + : false; + return _is; + } + bool harvest_authors() { + return opts["harvest-authors"]; + } + bool harvest_topics() { + return opts["harvest-topics"]; + } bool html() { bool _is; if ( opts["html"] || opts["html-seg"] || opts["html-scroll"]) @@ -886,7 +959,29 @@ if ((doc_matters.opt.action.debug_do) #+BEGIN_SRC d /+ ↓ debugs +/ if (doc_matters.opt.action.verbose) { - DocReformAbstractionSummary!()(doc_abstraction, doc_matters); + DocReformMetaDocSummary!()(doc_abstraction, doc_matters); +} +#+END_SRC + +**** 0. abstraction harvest :abstraction:harvest: +- abstraction harvest + +#+NAME: doc_reform_each_file_do_debugs_checkdoc +#+BEGIN_SRC d +if (doc_matters.opt.action.harvest) { + if (doc_matters.opt.action.harvest_authors) { + } + if (doc_matters.opt.action.harvest_topics) { + } + Harvest[] DocReformMetaDocHarvests()( + Harvest harvested, + Harvest[] harvests, + ) { + harvests ~= harvested; + return harvests; + } + harvested = DocReformMetaDocHarvest!()(doc_matters, harvested); + harvests = DocReformMetaDocHarvests!()(harvested, harvests); } #+END_SRC @@ -947,34 +1042,6 @@ break; // terminate, stop #+END_SRC * 2. pre-processing -** Harvest _get document head_ for harvest (separate thread) -*** 0 module template -- harvest, get document head - -#+BEGIN_SRC d :tangle "../src/doc_reform/meta/metadochead.d" -module doc_reform.meta.metadochead; -template DocReformHarvestGetFromHead() { // TODO - <> - <> - enum headBody { header, body_content, insert_file_list, image_list } - enum makeMeta { make, meta } - static auto rgx = Rgx(); - auto DocReformHarvestGetFromHead(E,O,M)( // TODO - E _env, - O _opt_action, - M _manifest - ){ - <> - <> - <> - <> - auto t = tuple(doc_matters_shared, doc_matters_abridged_collected); - static assert(t.length==2); - return t; - } -} -#+END_SRC - ** Output _document abstraction functions_ :module:doc_reform:abstraction: *** 0 module template - abstraction template @@ -1294,13 +1361,13 @@ if ((_opt_action.debug_do) #+END_SRC * 3. document abstraction _summary_ :module:doc_reform:metadoc_summary: -** 0. module template +** 0. module template metadoc summary - document summary from abstraction #+BEGIN_SRC d :tangle "../src/doc_reform/meta/metadoc_summary.d" module doc_reform.meta.metadoc_summary; -template DocReformAbstractionSummary() { - void DocReformAbstractionSummary(S,T)( +template DocReformMetaDocSummary() { + void DocReformMetaDocSummary(S,T)( const S doc_abstraction, T doc_matters, ) { @@ -1308,7 +1375,7 @@ template DocReformAbstractionSummary() { mixin InternalMarkup; <> if (doc_matters.opt.action.verbose) { - <> + <> } } } @@ -1344,7 +1411,7 @@ auto markup = InlineMarkup(); ** (last ocn) -#+name: meta_metadoc_summary +#+name: meta_metadoc_summary_document #+BEGIN_SRC d string[string] check = [ "last_object_number" : "NA [debug \"checkdoc\" not run]", @@ -1372,9 +1439,9 @@ foreach (k; doc_matters.has.keys_seq.seg) { } #+END_SRC -** summary +** document summary -#+name: meta_metadoc_summary +#+name: meta_metadoc_summary_document #+BEGIN_SRC d auto min_repeat_number = 66; auto char_repeat_number = (doc_matters.conf_make_meta.meta.title_full.length @@ -1439,6 +1506,84 @@ writefln( ); #+END_SRC +** 0. module template metadoc harvest + +#+BEGIN_SRC d :tangle "../src/doc_reform/meta/metadoc_harvest.d" +module doc_reform.meta.metadoc_harvest; +template DocReformMetaDocHarvest() { + auto DocReformMetaDocHarvest(T,H)( + T doc_matters, + H harvest, + ) { + <> + mixin InternalMarkup; + <> + <> + } +} +#+END_SRC + +** init +*** imports + +#+name: metadoc_harvest_imports +#+BEGIN_SRC d +import + doc_reform.meta.defaults, + doc_reform.meta.rgx; +import + std.array, + std.exception, + std.regex, + std.stdio, + std.string, + std.traits, + std.typecons, + std.uni, + std.utf, + std.conv : to; +#+END_SRC + +*** initialize :report: + +#+name: metadoc_harvest_initialize +#+BEGIN_SRC d +auto markup = InlineMarkup(); +#+END_SRC + +** harvest summary + +#+name: meta_metadoc_harvest_summary +#+BEGIN_SRC d +auto min_repeat_number = 66; +auto char_repeat_number = (doc_matters.conf_make_meta.meta.title_full.length + + doc_matters.conf_make_meta.meta.creator_author.length + 4); +char_repeat_number = (char_repeat_number > min_repeat_number) +? char_repeat_number +: min_repeat_number; +writefln( + "%s\n\"%s\", %s\n%s\n%s\n%s", + markup.repeat_character_by_number_provided("-", char_repeat_number), + doc_matters.conf_make_meta.meta.title_full, + doc_matters.conf_make_meta.meta.creator_author, + doc_matters.src.filename, + doc_matters.conf_make_meta.meta.classify_topic_register_arr, + markup.repeat_character_by_number_provided("-", char_repeat_number), +); +#+END_SRC + +** return harvest + +#+name: meta_metadoc_harvest +#+BEGIN_SRC d +harvest.title = doc_matters.conf_make_meta.meta.title_full; +harvest.author = doc_matters.conf_make_meta.meta.creator_author; +harvest.author_date_title = doc_matters.conf_make_meta.meta.author_date_title; +harvest.date_published = doc_matters.conf_make_meta.meta.date_published; +harvest.topic_register_arr = doc_matters.conf_make_meta.meta.classify_topic_register_arr; +return harvest; +#+END_SRC + * __END__ dev notes diff --git a/org/dr_build_scaffold.org b/org/dr_build_scaffold.org index 3a209a9..1bb8884 100644 --- a/org/dr_build_scaffold.org +++ b/org/dr_build_scaffold.org @@ -1132,7 +1132,6 @@ revision = head !tangle !*.org !*.d -!*.txt !*.rb !conf.sdl !org diff --git a/org/meta_conf_make_meta.org b/org/meta_conf_make_meta.org index 853134f..79ee9e7 100644 --- a/org/meta_conf_make_meta.org +++ b/org/meta_conf_make_meta.org @@ -260,7 +260,9 @@ struct MetaComposite { string classify_loc; string classify_subject; string classify_topic_register; + string[] classify_topic_register_arr; string creator_author; + string creator_author_surname_fn; string creator_author_email; string creator_illustrator; string creator_translator; @@ -302,6 +304,7 @@ struct MetaComposite { string title_short; string title_sub; string title_subtitle; + string author_date_title; } #+END_SRC @@ -431,6 +434,8 @@ JSONValue config_jsonstr = `{ module doc_reform.meta.conf_make_meta_json; static template contentJSONtoDocReformStruct() { import + std.algorithm, + std.array, std.exception, std.regex, std.stdio, @@ -442,6 +447,7 @@ static template contentJSONtoDocReformStruct() { import doc_reform.meta.conf_make_meta_structs, doc_reform.meta.conf_make_meta_json, + doc_reform.meta.defaults, doc_reform.meta.rgx; ConfCompositePlus _struct_composite; auto contentJSONtoDocReformStruct(C,J)(C _struct_composite, J _json, string _identifier) { @@ -809,6 +815,108 @@ if ("search" in _json.object) { #+name: json_objects #+BEGIN_SRC d /+ meta ------------------------------------------------------------------- +/ +if (_struct_composite.meta.creator_author.empty) { + if ("creator" in _json.object) { + if ("author" in _json.object["creator"] + && (_json.object["creator"]["author"].type().to!string == "string") + ) { + _struct_composite.meta.creator_author = _json.object["creator"]["author"].str; + } + if ("email" in _json.object["creator"] + && (_json.object["creator"]["email"].type().to!string == "string") + ) { + _struct_composite.meta.creator_author_email = _json.object["creator"]["email"].str; + } + if ("illustrator" in _json.object["creator"] + && (_json.object["creator"]["illustrator"].type().to!string == "string") + ) { + _struct_composite.meta.creator_illustrator = _json.object["creator"]["illustrator"].str; + } + if ("translator" in _json.object["creator"] + && (_json.object["creator"]["translator"].type().to!string == "string") + ) { + _struct_composite.meta.creator_translator = _json.object["creator"]["translator"].str; + } + } + string[] authors_arr; + string[][string] authors_hash_arr = [ "first" : [], "last" : [], "full" : [], "last_first" : [], "as_input" : [] ]; + string[] authors_raw_arr + = _struct_composite.meta.creator_author.split(rgx.arr_delimiter); + auto _lastname = appender!(char[])(); + foreach (author_raw; authors_raw_arr) { + authors_arr ~= author_raw.replace(rgx.raw_author_munge, "$2 $1"); + authors_hash_arr["first"] ~= author_raw.replace(rgx.raw_author_munge, "$2"); + authors_hash_arr["last"] ~= author_raw.replace(rgx.raw_author_munge, "$1"); + authors_hash_arr["full"] ~= author_raw.replace(rgx.raw_author_munge, "$2 $1"); + authors_hash_arr["as_input"] ~= author_raw; + if (auto m = author_raw.match(rgx.raw_author_munge)) { + (m.captures[1]).map!toUpper.copy(_lastname); + authors_hash_arr["last_first"] ~= _lastname.data.to!string ~ ", " ~ m.captures[2]; + _lastname = appender!(char[])(); + } + } + _struct_composite.meta.creator_author = authors_arr.join(", ").chomp.chomp; + string _author_name_last_first = authors_hash_arr["last_first"].join("; ").chomp.chomp; + _struct_composite.meta.creator_author_surname_fn = (_author_name_last_first.length > 0) + ? _author_name_last_first + : authors_hash_arr["as_input"].join("; ").chomp.chomp; +} +if (_struct_composite.meta.title_main.empty) { + if ("title" in _json.object) { + if ((_json.object["title"].type().to!string) == "string") { + _struct_composite.meta.title_main = _json.object["title"].str; + } else { + if ("edition" in _json.object["title"] + && (_json.object["title"]["edition"].type().to!string == "string") + ) { + _struct_composite.meta.title_edition = _json.object["title"]["edition"].str; + } + if ("full" in _json.object["title"] + && (_json.object["title"]["full"].type().to!string == "string") + ) {} + if ("language" in _json.object["title"] + && (_json.object["title"]["language"].type().to!string == "string") + ) { + _struct_composite.meta.title_language = _json.object["title"]["language"].str; + } + if ("main" in _json.object["title"] + && (_json.object["title"]["main"].type().to!string == "string") + ) { + _struct_composite.meta.title_main = _json.object["title"]["main"].str; + } else if ("title" in _json.object["title"] + && (_json.object["title"]["title"].type().to!string == "string") + ) { + _struct_composite.meta.title_main = _json.object["title"]["title"].str; + } + if ("note" in _json.object["title"] + && (_json.object["title"]["note"].type().to!string == "string") + ) { + _struct_composite.meta.title_note = _json.object["title"]["note"].str; + } + if ("sub" in _json.object["title"] + && (_json.object["title"]["sub"].type().to!string == "string") + ) { + _struct_composite.meta.title_sub = _json.object["title"]["sub"].str; + } + if ("subtitle" in _json.object["title"] + && (_json.object["title"]["subtitle"].type().to!string == "string") + ) { + _struct_composite.meta.title_subtitle = _json.object["title"]["subtitle"].str; + } + } + } + if ((!(_struct_composite.meta.title_subtitle.empty)) + && (_struct_composite.meta.title_sub.empty)) { + _struct_composite.meta.title_sub = _struct_composite.meta.title_subtitle; + } + _struct_composite.meta.title_full = (_struct_composite.meta.title_sub.empty) + ? _struct_composite.meta.title_main + : format( + "%s - %s", + _struct_composite.meta.title_main, + _struct_composite.meta.title_sub, + ); +} if ("classify" in _json.object) { if ("dewey" in _json.object["classify"] && (_json.object["classify"]["dewey"].type().to!string == "string") @@ -834,6 +942,22 @@ if ("classify" in _json.object) { && (_json.object["classify"]["topic_register"].type().to!string == "string") ) { _struct_composite.meta.classify_topic_register = _json.object["classify"]["topic_register"].str; + string[] main_topics_ = _struct_composite.meta.classify_topic_register.split(rgx.topic_register_main_terms_split); + string[] topics; + string topics_tmp; + string[] multiple_sub_terms; + foreach (mt; main_topics_) { + topics_tmp = mt.replaceAll(rgx.topic_register_main_term_plus_rest_split, mkup.sep); + if (auto m = topics_tmp.match(rgx.topic_register_multiple_sub_terms_split)) { + multiple_sub_terms = m.captures[1].split(rgx.topic_register_sub_terms_split); + foreach (subterm; multiple_sub_terms) { + topics ~= m.captures.pre ~ mkup.sep ~ subterm; + } + } else { + topics ~= topics_tmp; + } + } + _struct_composite.meta.classify_topic_register_arr = topics; } } if ("date" in _json.object) { @@ -872,6 +996,13 @@ if ("date" in _json.object) { ) { _struct_composite.meta.date_valid = _json.object["date"]["valid"].str; } + _struct_composite.meta.author_date_title = format( + "%s %s \"%s\"", + _struct_composite.meta.creator_author_surname_fn, + (_struct_composite.meta.date_published.length > 0) + ? "(" ~ _struct_composite.meta.date_published ~ ")" : "", + _struct_composite.meta.title_full, + ); } if ("links" in _json.object) {} if ("notes" in _json.object) { @@ -956,93 +1087,6 @@ if ("rights" in _json.object) { _struct_composite.meta.rights_license = _json.object["rights"]["license"].str; } } -if (_struct_composite.meta.creator_author.empty) { - if ("creator" in _json.object) { - if ("author" in _json.object["creator"] - && (_json.object["creator"]["author"].type().to!string == "string") - ) { - _struct_composite.meta.creator_author = _json.object["creator"]["author"].str; - } - if ("email" in _json.object["creator"] - && (_json.object["creator"]["email"].type().to!string == "string") - ) { - _struct_composite.meta.creator_author_email = _json.object["creator"]["email"].str; - } - if ("illustrator" in _json.object["creator"] - && (_json.object["creator"]["illustrator"].type().to!string == "string") - ) { - _struct_composite.meta.creator_illustrator = _json.object["creator"]["illustrator"].str; - } - if ("translator" in _json.object["creator"] - && (_json.object["creator"]["translator"].type().to!string == "string") - ) { - _struct_composite.meta.creator_translator = _json.object["creator"]["translator"].str; - } - } - string[] authors_arr; - string[] authors_raw_arr - = _struct_composite.meta.creator_author.split(rgx.arr_delimiter); - foreach (author_raw; authors_raw_arr) { - authors_arr ~= author_raw.replace(rgx.raw_author_munge, "$2 $1"); - } - _struct_composite.meta.creator_author = join(authors_arr, ", ").chomp.chomp; -} -if (_struct_composite.meta.title_main.empty) { - if ("title" in _json.object) { - if ((_json.object["title"].type().to!string) == "string") { - _struct_composite.meta.title_main = _json.object["title"].str; - } else { - if ("edition" in _json.object["title"] - && (_json.object["title"]["edition"].type().to!string == "string") - ) { - _struct_composite.meta.title_edition = _json.object["title"]["edition"].str; - } - if ("full" in _json.object["title"] - && (_json.object["title"]["full"].type().to!string == "string") - ) {} - if ("language" in _json.object["title"] - && (_json.object["title"]["language"].type().to!string == "string") - ) { - _struct_composite.meta.title_language = _json.object["title"]["language"].str; - } - if ("main" in _json.object["title"] - && (_json.object["title"]["main"].type().to!string == "string") - ) { - _struct_composite.meta.title_main = _json.object["title"]["main"].str; - } else if ("title" in _json.object["title"] - && (_json.object["title"]["title"].type().to!string == "string") - ) { - _struct_composite.meta.title_main = _json.object["title"]["title"].str; - } - if ("note" in _json.object["title"] - && (_json.object["title"]["note"].type().to!string == "string") - ) { - _struct_composite.meta.title_note = _json.object["title"]["note"].str; - } - if ("sub" in _json.object["title"] - && (_json.object["title"]["sub"].type().to!string == "string") - ) { - _struct_composite.meta.title_sub = _json.object["title"]["sub"].str; - } - if ("subtitle" in _json.object["title"] - && (_json.object["title"]["subtitle"].type().to!string == "string") - ) { - _struct_composite.meta.title_subtitle = _json.object["title"]["subtitle"].str; - } - } - } - if ((!(_struct_composite.meta.title_subtitle.empty)) - && (_struct_composite.meta.title_sub.empty)) { - _struct_composite.meta.title_sub = _struct_composite.meta.title_subtitle; - } - _struct_composite.meta.title_full = (_struct_composite.meta.title_sub.empty) - ? _struct_composite.meta.title_main - : format( - "%s - %s", - _struct_composite.meta.title_main, - _struct_composite.meta.title_sub, - ); -} #+END_SRC * 2. TOML returns DocReformStruct (via JSON) :module:conf_make_meta:struct: -- cgit v1.2.3