regex review, match speed & compile time, ctregex

- improve match time - add interim fontface identifier marker - improve compile time - remove unused regexes - separate out some specialized output matches
author: Ralph Amissah <ralph.amissah@gmail.com> 2022-11-25 22:06:40 -0500
committer: Ralph Amissah <ralph.amissah@gmail.com> 2022-12-23 18:17:41 -0500
commit: f6d28b62f0e02b8a88a1832589e203c7a613f45b (patch)
tree: b5d6462e45bae998190194784e02b143a83f79a3 /org/default_regex.org
parent: gitignore & things nix (diff)
1 files changed, 140 insertions, 58 deletions
diff --git a/org/default_regex.org b/org/default_regex.org
index 89d6ea3..976baa0 100644
--- a/org/default_regex.org
+++ b/org/default_regex.org
@@ -67,7 +67,6 @@ static template spineRgxIn() {
     <<meta_rgx_bibliography>>
     <<meta_rgx_book_index_split>>
     <<meta_rgx_topic_register_split>>
-    <<meta_rgx_language_codes>>
     <<prgmkup_rgx_spaces>>
     <<prgmkup_rgx_filename_and_path>>
     <<prgmkup_rgx_inline_breaks>>
@@ -86,10 +85,6 @@ static template spineRgxIn() {
 /+ misc +/
 static flag_action                                    = ctRegex!(`^(--[a-z][a-z0-9-]+)$`);
 static within_quotes                                  = ctRegex!(`"(.+?)"`, "m");
-static yaml_tag_is_str                                = ctRegex!(`:str$`);
-static yaml_tag_is_int                                = ctRegex!(`:int$`);
-static yaml_tag_is_map                                = ctRegex!(`:map$`);
-static yaml_tag_is_seq                                = ctRegex!(`:seq$`);
 static make_heading_delimiter                         = ctRegex!(`[;][ ]*`);
 static arr_delimiter                                  = ctRegex!(`[ ]*[;][ ]*`);
 static name_delimiter                                 = ctRegex!(`^([^,]+)[ ]*,[ ]+(.+?)$`);
@@ -476,8 +471,6 @@ static template spineRgxOut() {
     <<prgmkup_rgx_inline_links>>
     <<prgmkup_rgx_inline_font_face>>
     <<prgmkup_rgx_table>>
-    <<sp_ch_xhtml_rgx>>
-    <<sp_ch_latex_rgx>>
     <<grouped_text_rgx_paragraph_marks>>
   }
 }
@@ -492,35 +485,22 @@ static make_breakpage                           = ctRegex!(`new=(?P<breakpage>.+
 static make_breakcolumn                         = ctRegex!(`break=(?P<breakcolumn>.+?)(?:;|$)`,);
 #+END_SRC
 
-** special characters
-*** xhtml special characters
+* 2. ctRegex defaults shared by meta & output (generic)
 
-#+NAME: sp_ch_xhtml_rgx
+** meta
+
+#+NAME: prgmkup_rgx_meta
 #+BEGIN_SRC d
-static xhtml_ampersand                          = ctRegex!(`[&]`, "m");      // &amp;
-static xhtml_quotation                          = ctRegex!(`["]`, "m");      // &quot;
-static xhtml_less_than                          = ctRegex!(`[<]`, "m");      // &lt;
-static xhtml_greater_than                       = ctRegex!(`[>]`, "m");      // &gt;
-static xhtml_line_break                         = ctRegex!(` [\\]{2}`, "m"); // <br />
+static space                                    = ctRegex!(`[ ]`, "mg"); // a single space character
+static spaces_keep                              = ctRegex!(`(?P<keep_spaces>^[ ]+|[ ]{2,})`, "mg"); // line-leading spaces, or a run of 2+ spaces (code, verse, block)
 #+END_SRC
 
-*** latex special characters
+** spine & source_in
 
-#+NAME: sp_ch_latex_rgx
+#+NAME: prgmkup_rgx_in
 #+BEGIN_SRC d
-static latex_special_char                       = ctRegex!(`([%${}_#&\\])`);
-static latex_special_char_for_escape            = ctRegex!(`([%${}_#\\])`);
-static latex_special_char_for_escape_and_braces = ctRegex!(`([&])`);
-static latex_special_char_for_escape_url        = ctRegex!(`([%])`);
-static latex_special_char_escaped               = ctRegex!(`\\([%${}_#\\])`);
-static latex_special_char_escaped_braced        = ctRegex!(`[{]\\([&])[}]`);
-static latex_identify_inline_link               = ctRegex!(`┥.+?┝┤\S+?├`, "mg");
-static latex_identify_inline_fontface           = ctRegex!(`\\([_#$]┨.+?┣)\\([_#$])`, "mg");
-static latex_clean_internal_link                = ctRegex!(`^(?:#|¤\S+?#)`, "m");
-static latex_clean_bookindex_linebreak          = ctRegex!(`\s*\\\\\\\\\s*`, "m");
 #+END_SRC
 
-* 2. ctRegex defaults shared by meta & output (generic)
 ** misc generic
 
 #+NAME: prgmkup_rgx_spaces
@@ -534,24 +514,6 @@ static nbsp_chars                               = ctRegex!(`[░]+`, "mg");
 static middle_dot                               = ctRegex!(`·`, "mg");
 #+END_SRC
 
-** filename (& path) (including insert file) :insert:file:path:filename:
-
-#+NAME: prgmkup_rgx_filename_and_path
-#+BEGIN_SRC d
-static src_pth_sst_or_ssm                       = ctRegex!(`^(?P<path>[/]?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9._-]+[.](?P<extension>ss[tm]))$`);
-static src_pth_pod_sst_or_ssm                   = ctRegex!(`^(?P<podpath>[/]?(?:[a-zA-Z0-9._-]+/)*)media/text/[a-z]{2}/(?P<filename>[a-zA-Z0-9][a-zA-Z0-9._-]*?[.]ss[tm])$`);
-static src_pth_contents                         = ctRegex!(`^(?P<path>[/]?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9][a-zA-Z0-9._-]*)/pod[.]manifest$`);
-static src_pth_zip                              = ctRegex!(`^(?P<path>[/]?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9._-]+[.]zip)$`);
-static src_pth_types                            = ctRegex!(`^(?P<path>[/]?[a-zA-Z0-9._-]+/)*(?P<gotfile>(?P<filename>[a-zA-Z0-9._-]+[.]ss[tm])|(?P<filelist>[a-zA-Z0-9._-]+/pod[.]manifest)|(?P<filezip>[a-zA-Z0-9._-]+[.]zip))$`);
-static src_fn                                   =
-  ctRegex!(`^([/]?(?:[a-zA-Z0-9._-]+/)*)(?P<fn_src>(?P<fn_base>[a-zA-Z0-9._-]+)[.](?P<fn_src_suffix>ss[tm]))$`);
-static src_fn_master                            = ctRegex!(`^(?P<path>/?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9._-]+[.]ssm)$`);
-static src_fn_find_inserts                      = ctRegex!(`^(?P<path>/?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9._-]+[.]ss[im])$`);
-static insert_src_fn_ssi_or_sst                 = ctRegex!(`^<<\s*(?P<path>[a-zA-Z0-9._-]+/)*(?P<filename>[a-zA-Z0-9._-]+[.]ss[ti])$`);
-static src_base_parent_dir_name                 = ctRegex!(`[/](?P<dir>(?:[a-zA-Z0-9._-]+))(?:/media/text/[a-z]{2})$`); // formalizes dir structure
-static src_formalised_file_path_parts           = ctRegex!(`(?P<pth>(?:[/a-zA-Z0-9._-]+?)(?P<dir>[a-zA-Z0-9._-]+))(?:/media/text/[a-z]{2})$`); // formalizes dir structure
-#+END_SRC
-
 ** inline markup
 
 *** inline breaks
@@ -666,21 +628,21 @@ static quotation_mark_sql_insert_delimiter      = ctRegex!("[']", "mg");
 #+NAME: prgmkup_rgx_inline_font_face
 #+BEGIN_SRC d
 /+ inline markup font face mod +/
-static inline_emphasis                          = ctRegex!(`[*]┨(?P<text>.+?)┣[*]`, "mg");
-static inline_bold                              = ctRegex!(`[!]┨(?P<text>.+?)┣[!]`, "mg");
-static inline_underscore                        = ctRegex!(`[_]┨(?P<text>.+?)┣[_]`, "mg");
-static inline_italics                           = ctRegex!(`[/]┨(?P<text>.+?)┣[/]`, "mg");
-static inline_superscript                       = ctRegex!(`\^┨(?P<text>.+?)┣\^`, "mg");
-static inline_subscript                         = ctRegex!(`[,]┨(?P<text>.+?)┣[,]`, "mg");
-static inline_strike                            = ctRegex!(`[-]┨(?P<text>.+?)┣[-]`, "mg");
-static inline_insert                            = ctRegex!(`[+]┨(?P<text>.+?)┣[+]`, "mg");
-static inline_mono                              = ctRegex!(`[■]┨(?P<text>.+?)┣[■]`, "mg");
-static inline_cite                              = ctRegex!(`[‖]┨(?P<text>.+?)┣[‖]`, "mg");
+static inline_emphasis                          = ctRegex!(`⑆[*]┨(?P<text>.+?)┣[*]`, "mg"); // ⑆ prefix: interim fontface identifier marker (every pattern in this block requires it)
+static inline_bold                              = ctRegex!(`⑆[!]┨(?P<text>.+?)┣[!]`, "mg");
+static inline_underscore                        = ctRegex!(`⑆[_]┨(?P<text>.+?)┣[_]`, "mg");
+static inline_italics                           = ctRegex!(`⑆[/]┨(?P<text>.+?)┣[/]`, "mg");
+static inline_superscript                       = ctRegex!(`⑆\^┨(?P<text>.+?)┣\^`, "mg");
+static inline_subscript                         = ctRegex!(`⑆[,]┨(?P<text>.+?)┣[,]`, "mg");
+static inline_strike                            = ctRegex!(`⑆[-]┨(?P<text>.+?)┣[-]`, "mg");
+static inline_insert                            = ctRegex!(`⑆[+]┨(?P<text>.+?)┣[+]`, "mg");
+static inline_mono                              = ctRegex!(`⑆[■]┨(?P<text>.+?)┣[■]`, "mg");
+static inline_cite                              = ctRegex!(`⑆[‖]┨(?P<text>.+?)┣[‖]`, "mg");
 #+END_SRC
 
 #+BEGIN_SRC d
-// static inline_superscript                    = ctRegex!(`[\^]┨(?P<text>.+?)┣[\^]`, "mg");
-// static inline_fontface_clean                 = ctRegex!(`[*!_/^,+■‖-]┨|┣[*!_/^,+■‖-]`, "mg");
+// static inline_superscript                             = ctRegex!(`⑆[\^]┨(?P<text>.+?)┣[\^]`, "mg");
+// static inline_fontface_clean                          = ctRegex!(`⑆[*!_/^,+■‖-]┨|┣[*!_/^,+■‖-]`, "mg");
 #+END_SRC
 
 *** table related
@@ -692,6 +654,126 @@ static table_delimiter_col                      = ctRegex!("[ ]*[┊][ ]*", "mg"
 static table_delimiter_row                      = ctRegex!("[ ]*\n", "mg");
 #+END_SRC
 
+** files filename (& path) (including insert file) :insert:file:path:filename:
+
+#+HEADER: :tangle "../src/doc_reform/meta/rgx_files.d"
+#+HEADER: :noweb yes
+#+BEGIN_SRC d
+<<doc_header_including_copyright_and_license>>
+/++
+  regex: file name & path (and language code) regular expressions used in sisu document parser
++/
+module doc_reform.meta.rgx_files;
+static template spineRgxFiles() {
+  static struct RgxFiles {
+    <<prgmkup_rgx_filename_and_path>>
+    <<meta_rgx_language_codes>>
+  }
+}
+#+END_SRC
+
+#+NAME: prgmkup_rgx_filename_and_path
+#+BEGIN_SRC d
+static src_pth_sst_or_ssm                       = ctRegex!(`^(?P<path>[/]?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9._-]+[.](?P<extension>ss[tm]))$`); // path + filename ending .sst or .ssm
+static src_pth_pod_sst_or_ssm                   = ctRegex!(`^(?P<podpath>[/]?(?:[a-zA-Z0-9._-]+/)*)media/text/[a-z]{2}/(?P<filename>[a-zA-Z0-9][a-zA-Z0-9._-]*?[.]ss[tm])$`); // podpath + media/text/<two lowercase letters>/ + filename ending .sst or .ssm
+static src_pth_contents                         = ctRegex!(`^(?P<path>[/]?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9][a-zA-Z0-9._-]*)/pod[.]manifest$`); // path + name + /pod.manifest
+static src_pth_zip                              = ctRegex!(`^(?P<path>[/]?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9._-]+[.]zip)$`); // path + filename ending .zip
+static src_pth_types                            = ctRegex!(`^(?P<path>[/]?[a-zA-Z0-9._-]+/)*(?P<gotfile>(?P<filename>[a-zA-Z0-9._-]+[.]ss[tm])|(?P<filelist>[a-zA-Z0-9._-]+/pod[.]manifest)|(?P<filezip>[a-zA-Z0-9._-]+[.]zip))$`); // any one of: .sst/.ssm file, <name>/pod.manifest, .zip file
+static src_fn                                   = ctRegex!(`^([/]?(?:[a-zA-Z0-9._-]+/)*)(?P<fn_src>(?P<fn_base>[a-zA-Z0-9._-]+)[.](?P<fn_src_suffix>ss[tm]))$`); // path (unnamed group) + fn_src, split into fn_base and fn_src_suffix (sst or ssm)
+static src_fn_master                            = ctRegex!(`^(?P<path>/?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9._-]+[.]ssm)$`); // path + filename ending .ssm
+static src_fn_find_inserts                      = ctRegex!(`^(?P<path>/?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9._-]+[.]ss[im])$`); // path + filename ending .ssi or .ssm
+static insert_src_fn_ssi_or_sst                 = ctRegex!(`^<<\s*(?P<path>[a-zA-Z0-9._-]+/)*(?P<filename>[a-zA-Z0-9._-]+[.]ss[ti])$`); // "<<" + optional whitespace + optional path + filename ending .sst or .ssi
+static src_base_parent_dir_name                 = ctRegex!(`[/](?P<dir>(?:[a-zA-Z0-9._-]+))(?:/media/text/[a-z]{2})$`); // dir name directly before a trailing /media/text/<two lowercase letters> (formalizes dir structure)
+static src_formalised_file_path_parts           = ctRegex!(`(?P<pth>(?:[/a-zA-Z0-9._-]+?)(?P<dir>[a-zA-Z0-9._-]+))(?:/media/text/[a-z]{2})$`); // pth (path ending in dir) before a trailing /media/text/<two lowercase letters> (formalizes dir structure)
+#+END_SRC
+
+** _module template yaml tags
+
+#+HEADER: :tangle "../src/doc_reform/meta/rgx_yaml_tags.d"
+#+HEADER: :noweb yes
+#+BEGIN_SRC d
+<<doc_header_including_copyright_and_license>>
+/++
+  regex: yaml tag regular expressions used in sisu document parser; NOTE(review): declared module name rgx_yaml differs from tangled file name rgx_yaml_tags.d - confirm imports resolve as intended
++/
+module doc_reform.meta.rgx_yaml;
+static template spineRgxYamlTags() {
+  static struct RgxYaml {
+    <<meta_rgx_yaml>>
+  }
+}
+#+END_SRC
+
+#+NAME: meta_rgx_yaml
+#+BEGIN_SRC d
+static yaml_tag_is_str                          = ctRegex!(`:str$`); // matches a trailing ":str"
+static yaml_tag_is_int                          = ctRegex!(`:int$`); // matches a trailing ":int"
+static yaml_tag_is_map                          = ctRegex!(`:map$`); // matches a trailing ":map"
+static yaml_tag_is_seq                          = ctRegex!(`:seq$`); // matches a trailing ":seq"
+#+END_SRC
+
+** special characters
+*** xhtml special characters template
+
+#+HEADER: :tangle "../src/doc_reform/io_out/rgx_xhtml.d"
+#+HEADER: :noweb yes
+#+BEGIN_SRC d
+<<doc_header_including_copyright_and_license>>
+/++
+  regex: xhtml special character regular expressions used in sisu document parser
++/
+module doc_reform.io_out.rgx_xhtml;
+static template spineRgxXHTML() {
+  static struct RgxXHTML {
+    <<sp_ch_xhtml_rgx>>
+  }
+}
+#+END_SRC
+
+*** xhtml special characters
+
+#+NAME: sp_ch_xhtml_rgx
+#+BEGIN_SRC d
+static ampersand                                = ctRegex!(`[&]`, "m");      // matches & (xhtml: &amp;)
+static quotation                                = ctRegex!(`["]`, "m");      // matches " (xhtml: &quot;)
+static less_than                                = ctRegex!(`[<]`, "m");      // matches < (xhtml: &lt;)
+static greater_than                             = ctRegex!(`[>]`, "m");      // matches > (xhtml: &gt;)
+static line_break                               = ctRegex!(` [\\]{2}`, "m"); // matches a space followed by two backslashes (xhtml: <br />)
+#+END_SRC
+
+*** LaTeX special characters template
+
+#+HEADER: :tangle "../src/doc_reform/io_out/rgx_latex.d"
+#+HEADER: :noweb yes
+#+BEGIN_SRC d
+<<doc_header_including_copyright_and_license>>
+/++
+  regex: LaTeX special character regular expressions used in sisu document parser
++/
+module doc_reform.io_out.rgx_latex;
+static template spineRgxLSC() {
+  static struct RgxLSC {
+    <<sp_ch_latex_rgx>>
+  }
+}
+#+END_SRC
+
+*** latex special characters
+
+#+NAME: sp_ch_latex_rgx
+#+BEGIN_SRC d
+static latex_special_char                       = ctRegex!(`([%${}_#&\\])`); // captures any one of: % $ { } _ # & backslash
+static latex_special_char_for_escape            = ctRegex!(`([%${}_#\\])`); // captures any one of: % $ { } _ # backslash (no &)
+static latex_special_char_for_escape_and_braces = ctRegex!(`([&])`); // captures &
+static latex_special_char_for_escape_url        = ctRegex!(`([%])`); // captures %
+static latex_special_char_escaped               = ctRegex!(`\\([%${}_#\\])`); // backslash followed by one of % $ { } _ # backslash (that char captured)
+static latex_special_char_escaped_braced        = ctRegex!(`[{]\\([&])[}]`); // {\&} with the & captured
+static latex_identify_inline_link               = ctRegex!(`┥.+?┝┤\S+?├`, "mg"); // ┥text┝ followed by ┤non-whitespace├
+static latex_identify_inline_fontface           = ctRegex!(`\\([_#$]┨.+?┣)\\([_#$])`, "mg"); // backslash-escaped _ # or $ on either side of ┨text┣
+static latex_clean_internal_link                = ctRegex!(`^(?:#|¤\S+?#)`, "m"); // line-leading "#", or "¤" + non-whitespace + "#"
+static latex_clean_bookindex_linebreak          = ctRegex!(`\s*\\\\\\\\\s*`, "m"); // four literal backslashes with optional surrounding whitespace
+#+END_SRC
+
 * document header including copyright & license
 
 #+NAME: doc_header_including_copyright_and_license
author	Ralph Amissah <ralph.amissah@gmail.com>	2022-11-25 22:06:40 -0500
committer	Ralph Amissah <ralph.amissah@gmail.com>	2022-12-23 18:17:41 -0500
commit	f6d28b62f0e02b8a88a1832589e203c7a613f45b (patch)
tree	b5d6462e45bae998190194784e02b143a83f79a3 /org/default_regex.org
parent	gitignore & things nix (diff)