The weekly challenge 259 - Task 2: Line Parser
#!/usr/bin/env perl
# https://theweeklychallenge.org/blog/perl-weekly-challenge-259/#TASK2
#
# Task 2: Line Parser
# ===================
#
# You are given a line like below:
#
# {% id field1="value1" field2="value2" field3=42 %}
#
# Where
#
# a) "id" can be \w+.
# b) There can be 0 or more field-value pairs.
# c) The names of the fields are \w+.
# d) The values are either number in which case we don't need double quotes or
#    string in which case we need double quotes around them.
#
# The line parser should return structure like below:
#
# {
#     name => id,
#     fields => {
#         field1 => value1,
#         field2 => value2,
#         field3 => value3,
#     }
# }
#
# It should be able to parse the following edge cases too:
#
# {% youtube title="Title \"quoted\" done" %}
#
# and
#
# {% youtube title="Title with escaped backslash \\" %}
#
# BONUS: Extend it to be able to handle multiline tags:
#
# {% id field1="value1" ... %}
# LINES
# {% endid %}
#
# You should expect the following structure from your line parser:
#
# {
#     name => id,
#     fields => {
#         field1 => value1,
#         field2 => value2,
#         field3 => value3,
#     },
#     text => LINES
# }
#
############################################################
##
## discussion
##
############################################################
#
# Parsers are always a bit complicated, so most solutions to
# this problem will probably be as well.
# There might also always be some corner cases that are not
# clear right away. In this case the description says:
# The values are either number in which case we don't need
# double quotes or string in which case we need double quotes
# around them.
# It remains unclear whether or not this includes floating point
# numbers or only integers, so I decided to at least also support
# decimal numbers written with a single "." like 12.34, but not
# the full list of possible representations of floats like 1E5 or
# similar stuff.
# A full parser usually needs to split everything into tokens which
# can then be processed one by one, and regular expressions are not
# enough to build a full parser in most cases. In our case, I used
# a mixed approach: handle everything with regular expressions
# that is easy to handle that way. However, parsing strings is done
# by handing in the rest of the current line into a function that
# will then return the string at the beginning of the line and the
# remainder of the input once the string is removed. This function
# parses the rest of the line one character at a time, no regular
# expressions there. The loop that calls this function goes one line
# at a time, always picking up larger chunks of the line by using
# regular expressions. This outer loop makes use of the Switch
# module unnecessarily as my first intention was to implement a
# full character-by-character parser which would have required
# some kind of a state machine to always know where in the parsing
# process we are at any time, but then I decided to try an
# approach making use of regular expressions, so a single variable
# could have taken care of everything (Just keep the sectionname
# in a variable, and if empty assume we're in the default state
# and take it from there), but then I wouldn't have had the
# opportunity to try out the Switch module, so I kept the code as
# it was.
use strict;
use warnings;
use Data::Dumper;

# NOTE: the discussion in the file header describes driving the line loop
# with the Switch module. It is deliberately not used here: Switch is a
# source filter that was removed from the Perl core in 5.14, and the parser
# only ever has two states ("inside a tag" / "not inside a tag"), which map
# directly onto whether a tag is currently open. Keeping `use Switch;`
# without a switch statement would still run the filter over this source,
# so the import is dropped together with its only user.

# Sample input. The single-quoted heredoc keeps every backslash literal, so
# what is shown here is exactly what the parser receives.
my $input = <<'EOF';
{% id field1="value1" field2="value2" field3=42 %}
{% endid %}
Some random data somewhere in between
{% youtube title="Title \"quoted\" done" %}
{% endyoutube %}
{% youtube title="Title with escaped backslash \\" %}
Some data
More data
{% endyoutube %}
{% id field1="value1" field2="value2" field3=42.32 field4="Hello, world!" %}
{% endid %}
{% foo %}
Bar.
{% endfoo %}
EOF

foreach my $entry (parse_input($input)) {
    print Dumper $entry;
}

# Parse a complete input text, one line at a time.
#
# Parameters:
#   $text - the input; lines are separated by "\n".
#
# Returns a list with one element per parsed item, in input order:
#   * a hash reference { name => ..., fields => {...}, text => ... } for
#     every tag ("fields" is only present when the tag has fields, "text"
#     only when lines were found between the tag and its end tag);
#   * the line itself (a plain string) for every line outside of a tag.
#
# Dies on a malformed tag line (see _parse_tag_line).
sub parse_input {
    my ($text) = @_;

    my @result;
    my $current;    # tag being collected; undef while outside of a tag

    foreach my $line (split /\n/, $text) {
        if (defined $current) {
            # Inside a tag: either its end tag closes it, or the line is
            # copied verbatim into the tag's text.
            my $name = $current->{name};
            if ($line =~ m/^\s*\{\%\s*end\Q$name\E\s*\%\}\s*$/) {
                push @result, $current;
                undef $current;
            }
            else {
                $current->{text} .= "$line\n";
            }
        }
        elsif ($line =~ m/^\s*\{\%\s*\w+/) {
            $current = _parse_tag_line($line);
        }
        else {
            push @result, $line;
        }
    }

    # A tag without an end tag (e.g. a single "{% id ... %}" line) must not
    # get lost when the input ends.
    push @result, $current if defined $current;

    return @result;
}

# Parse one opening tag line of the form
#   {% name field1="string" field2=42 ... %}
#
# Parameters:
#   $line - the complete tag line.
#
# Returns a hash reference { name => ..., fields => { ... } }; the "fields"
# key only exists when the tag has at least one field.
#
# Dies when the line does not start with "{% name", when the closing "%}"
# is missing, when a field name is missing, or when a value is neither a
# number nor a double-quoted string.
sub _parse_tag_line {
    my ($line) = @_;

    my $rest = $line;
    $rest =~ s/^\s*\{\%\s*(\w+)\s*//
        or die "Parse error, not a tag line ->$line!";
    my %tag = (name => $1);

    while (1) {
        $rest =~ s/^\s+//;
        last if $rest =~ m/^\%\}\s*$/;    # closing marker: tag is complete
        die "Parse error, missing closing '%}' in ->$line!" if $rest eq '';

        # Every field starts with "name="; without this check a stale $1
        # would silently be used as the key.
        $rest =~ s/^(\w+)=//
            or die "Parse error, expected field name here ->$rest!";
        my $key = $1;

        if ($rest =~ s/^(-?\d+(?:\.\d+)?)(?=\s|\%\})//) {
            # Integer or simple decimal, optionally negative. It has to be
            # followed by whitespace or the closing marker.
            $tag{fields}{$key} = $1;
        }
        elsif ($rest =~ m/^"/) {
            # Quoted string: the character-level parser returns the decoded
            # string and whatever follows it on the line.
            my $string;
            ($string, $rest) = parse_next_string($rest);
            $tag{fields}{$key} = $string;
        }
        else {
            die "Parse error, value neither number nor string here ->$rest!";
        }
    }

    return \%tag;
}

# Given the rest of a line that starts with a double-quoted string, split
# it into the decoded string and the remainder of the line after the
# closing quote.
#
# Parameters:
#   $input - text whose first character must be '"'.
#
# Returns the list ($string, $rest).
#
# A backslash makes the following character literal: '\\' becomes '\',
# '\"' becomes '"', and anything else such as '\a' simply becomes 'a'.
#
# Dies with "Not a string" when the input does not start with '"', and
# with a parser error when no closing '"' is found (this includes an input
# that ends in a dangling backslash).
sub parse_next_string {
    my $input = shift;
    die "Not a string"
        unless defined $input && substr($input, 0, 1) eq '"';

    my @chars  = split //, $input;
    my $string = "";
    my $index  = 1;    # skip the opening quote

    while ($index <= $#chars) {
        my $char = $chars[$index];
        if ($char eq '"') {    # closing '"', finish everything up
            return ($string, substr($input, $index + 1));
        }
        if ($char eq "\\") {
            # Take the escaped character as-is. A backslash at the very end
            # has nothing to escape, so the string is unterminated.
            $index++;
            last if $index > $#chars;
            $char = $chars[$index];
        }
        $string .= $char;
        $index++;
    }

    # if we end up here, there was no matching '"'
    # so we ran into a parsing error
    die "Parser error: Could not find closing '\"' in $input";
}