perl logo Perl logo (Thanks to Olaf Alders)

The weekly challenge 259 - Task 2: Line Parser

  1 #!/usr/bin/env perl
  2 # https://theweeklychallenge.org/blog/perl-weekly-challenge-259/#TASK2
  3 #
  4 # Task 2: Line Parser
  5 # ===================
  6 #
  7 # You are given a line like below:
  8 #
  9 # {%  id   field1="value1"    field2="value2"  field3=42 %}
 10 #
 11 # Where
 12 #
 13 # a) "id" can be \w+.
 14 # b) There can be 0  or more field-value pairs.
 15 # c) The name of the fields are \w+.
 16 # d) The values are either numbers, in which case we don't need double quotes,
 17 #    or strings, in which case we need double quotes around them.
 18 #
 19 # The line parser should return structure like below:
 20 #
 21 # {
 22 #        name => id,
 23 #        fields => {
 24 #            field1 => value1,
 25 #            field2 => value2,
 26 #            field3 => value3,
 27 #        }
 28 # }
 29 #
 30 # It should be able to parse the following edge cases too:
 31 #
 32 # {%  youtube title="Title \"quoted\" done" %}
 33 #
 34 # and
 35 #
 36 # {%  youtube title="Title with escaped backslash \\" %}
 37 #
 38 # BONUS: Extend it to be able to handle multiline tags:
 39 #
 40 # {% id  field1="value1" ... %}
 41 # LINES
 42 # {% endid %}
 43 #
 44 # You should expect the following structure from your line parser:
 45 #
 46 # {
 47 #        name => id,
 48 #        fields => {
 49 #            field1 => value1,
 50 #            field2 => value2,
 51 #            field3 => value3,
 52 #        }
 53 #        text => LINES
 54 # }
 55 #
 56 ############################################################
 57 ##
 58 ## discussion
 59 ##
 60 ############################################################
 61 #
 62 # Parsers are always a bit complicated, so most solutions to
 63 # this problem will probably be as well.
 64 # There might also always be some corner cases that are not
 65 # clear right away. In this case the description says:
 66 #    The values are either number in which case we don't need
 67 #    double quotes or string in which case we need double quotes
 68 #    around them.
 69 # It remains unclear whether or not this includes floating point
 70 # numbers or only integers, so I decided to at least also support
 71 # decimal number written with a single "." like 12.34, but not
 72 # the full list of possible representations of floats like 1E5 or
 73 # similar stuff.
 74 # A full parser usually needs to split everything into tokens which
 75 # can then be processed one by one, and regular expressions are not
 76 # enough to build a full parser in most cases. In our case, I used
 77 # a mixed approach: handle everything with regular expressions
 78 # that is easy to handle that way. However, parsing strings is done
 79 # by handing in the rest of the current line into a function that
 80 # will then return the string at the beginning of the line and the
 81 # remainder of the input once the string is removed. This function
 82 # parses the rest of the line one character at a time, no regular
 83 # expressions there. The loop that calls this function goes one line
 84 # at a time, always picking up larger chunks of the line by using
 85 # regular expressions. This outer loop makes use of the Switch
 86 # module unnecessarily as my first intention was to implement a
 87 # full character-by-character parser which would have required
 88 # some kind of a state machine to always know where in the parsing
 89 # process we are at any time, but then I decided to try an
 90 # approach making use of regular expressions, so a single variable
 91 # could have taken care of everything (Just keep the sectionname
 92 # in a variable, and if empty assume we're in the default state
 93 # and take it from there), but then I wouldn't have had the
 94 # opportunity to try out the Switch module, so I kept the code as
 95 # it was.
 96 
 97 use strict;
 98 use warnings;
 99 use Switch;
100 use Data::Dumper;
101 
# Names of the two parser states, plus the name of the section that is
# currently open (only meaningful while in state $INSECTION).
my $DEFAULT = 'Default';
my $INSECTION = 'InSection';
my $SECTIONNAME = "";

my @ALL_DATA; # the result will be stored here
my $CURRENT_DATA = {}; # for temporarily storing the data we're currently parsing
my $state = $DEFAULT; # initialize the state machine

my $input = <<EOF;
{%  id   field1="value1"    field2="value2"  field3=42 %}
{%  endid %}
Some random data somewhere in between
{%  youtube title="Title \\"quoted\\" done" %}
{% endyoutube %}
{%  youtube title="Title with escaped backslash \\\\" %}
Some data
More data
{% endyoutube %}
{%  id   field1="value1"    field2="value2"  field3=42.32 field4="Hello, world!" %}
{%  endid %}
{%  foo %}
Bar.
{%  endfoo %}
EOF
my @lines = split /\n/, $input;

# Walk the input one line at a time. Two states are enough, so a plain
# if/elsif chain does the dispatching; the Switch source filter is not
# needed for this.
foreach my $line (@lines) {
   if($state eq $DEFAULT) {
      # not inside a section, so we expect either the begin of a new section
      # or a plain string to ingest
      if($line =~ m/^\s*\{\%\s*\w+/) { # new section starts
         my ($name, $fields) = _parse_opening_tag($line);
         $SECTIONNAME = $name;
         $CURRENT_DATA = { name => $name };
         # only create the "fields" entry when there is at least one field
         $CURRENT_DATA->{fields} = $fields if %$fields;
         $state = $INSECTION;
      } else { # plain string to ingest
         push @ALL_DATA, $line;
      }
   } elsif($state eq $INSECTION) {
      # either we find the end of the current section, or we copy the line
      # as-is to the text inside our data structure
      if($line =~ m/^\s*\{\%\s*end\Q$SECTIONNAME\E\s*\%\}\s*$/) {
         # end of section: store current temporary data into the result,
         # reset the temporary data structure and go back to the default
         # state
         push @ALL_DATA, $CURRENT_DATA;
         $CURRENT_DATA = {};
         $state = $DEFAULT;
      } else {
         $CURRENT_DATA->{text} .= "$line\n";
      }
   } else {
      die "Unknown state $state!\n";
   }
}

# A section that is still open at the end of the input would otherwise be
# lost without any notice: report it and keep what has been read so far.
if($state eq $INSECTION) {
   warn "Parse error: section '$SECTIONNAME' is never closed, keeping partial data\n";
   push @ALL_DATA, $CURRENT_DATA;
}

foreach my $entry (@ALL_DATA) {
   print Dumper $entry;
}

# Parse the opening tag of a section, for example
#    {%  id   field1="value1"  field3=42 %}
#
# Parameter: the complete line holding the opening tag.
# Returns:   the section name and a reference to a hash of field => value
#            (the hash is empty if the tag has no fields).
# Dies:      if the line does not start with "{% name", if a field name is
#            missing, if a value is neither a number nor a quoted string,
#            or if the closing "%}" is missing.
sub _parse_opening_tag {
   my $rest = shift;

   # remove start of line including the section name; whitespace after the
   # name is optional so that "{% foo%}" is accepted as well
   $rest =~ s/^\s*\{\%\s*(\w+)\s*//
      or die "Parse error, no section start here ->$rest!";
   my $name = $1;

   my %fields;
   while(1) {
      # remove unnecessary whitespace
      $rest =~ s/^\s+//;
      # closing "%}" reached: the tag is complete
      last if $rest =~ m/^\%\}\s*$/;

      # A failed match leaves $1 untouched, so the result of the
      # substitution has to be checked before $1 is used as the key.
      $rest =~ s/^(\w+)=//
         or die "Parse error, expected field name or closing '%}' here ->$rest!";
      my $key = $1;

      if($rest =~ s/^(\d+(?:\.\d+)?)(?=\s|\%\})//) {
         # number (integer or simple decimal), which has to be followed
         # by whitespace or by the closing "%}"
         $fields{$key} = $1;
      } elsif($rest =~ m/^"/) {
         # quoted string: the character-wise string parser returns the
         # string and whatever is left of the line afterwards
         (my $string, $rest) = parse_next_string($rest);
         $fields{$key} = $string;
      } else {
         die "Parse error, value neither number nor string here ->$rest!";
      }
   }
   return ($name, \%fields);
}
196 
# Given the rest of a line that starts with a double-quoted string, split it
# into that string and the remainder of the line after the closing quote.
#
# Parameter: text whose first character has to be '"'.
# Returns:   a two-element list ($string, $rest). $string is the content
#            between the quotes with backslash escapes resolved ('\\' gives
#            '\', '\"' gives '"', any other '\x' gives 'x'); $rest is
#            everything after the closing quote.
# Dies:      with "Not a string" if the input is undefined, empty or does
#            not start with '"', and with a "Parser error" if there is no
#            closing quote.
sub parse_next_string {
   my $input = shift;
   # substr on an empty string returns "", so neither undefined nor empty
   # input can trigger an "uninitialized value" warning here
   die "Not a string" unless defined $input && substr($input, 0, 1) eq '"';
   my @chars = split //, $input;
   my $string = "";
   my $index = 1;
   while($index <= $#chars) {
      my $char = $chars[$index];
      if($char eq '"') { # closing '"', finish everything up
         return ($string, substr($input, $index + 1));
      }
      if($char eq "\\") {
         # a '\' means: ingest the next character as is
         $index++;
         # a '\' as the very last character has nothing to escape; stop
         # here so the missing closing quote is reported below
         last if $index > $#chars;
         $char = $chars[$index];
      }
      $string .= $char;
      $index++;
   }
   # if we end up here, there was no matching '"'
   # so we ran into a parsing error
   die "Parser error: Could not find closing '\"' in $input";
}