The weekly challenge 376 - Task 2: Doubled Words
1 #!/usr/bin/env perl 2 # https://theweeklychallenge.org/blog/perl-weekly-challenge-376/#TASK2 3 # 4 # Task 2: Doubled Words 5 # ===================== 6 # 7 # You are given a string (which may contain embedded newlines) which is taken 8 # from a page on a website. The string will not contain brackets qw{ [ ] }. 9 # 10 # Write a script that will find doubled words (such as “this this”) and 11 # highlight (wrap in brackets) each doubled word. 12 # 13 # The script should: 14 # 15 # - Work across lines, even finding situations where a word at the end of 16 # one line is repeated at the beginning of the next. 17 # 18 # - Find doubled words despite capitalization differences, such as with 19 # 'The the...', as well as allow differing amounts of whitespace (spaces, 20 # tabs, newlines, and the like) to lie between the words. 21 # 22 # - Find doubled words even when separated by HTML tags. For example, to 23 # make a word bold: '...it is <B>very</B> very important...'. Only show 24 # lines containing doubled words. 25 # 26 # Example 1 27 # 28 # Input: $str = "you're given the job of checking the pages on a\nweb server 29 # for doubled words (such as 'this this'), a common problem\nwith documents 30 # subject to heavy editing." 31 # Output: "web server for doubled words (such as '[this] [this]'), a common problem" 32 # 33 # Example 2 34 # 35 # Input: $str = "Find doubled words despite capitalization differences, such as 36 # with 'The\nthe...', as well as allow differing amounts of whitespace 37 # (spaces,\ntabs, newlines, and the like) to lie between the words." 38 # Output: "Find doubled words despite capitalization differences, such as with 39 # '[The]\n[the]...', as well as allow differing amounts of whitespace (spaces," 40 # 41 # Example 3 42 # 43 # Input: $str = "to make a word bold: '...it is <B>very</B> very important...'." 44 # Output: "to make a word bold: '...it is <B>[very]</B> [very] important...'." 45 # 46 # Example 4 47 # 48 # Input: $str = "Perl officially stands for Practical Extraction and Report 49 # Language, except when it doesn't." 50 # Output: "" 51 # 52 # Example 5 53 # 54 # Input: $str = "There's more than one one way to do it.\nEasy things should be 55 # easy and hard things should be possible." 56 # Output: "There's more than [one] [one] way to do it." 57 # 58 ############################################################ 59 ## 60 ## discussion 61 ## 62 ############################################################ 63 # 64 # This is a fun regex one. We capture the first word, make sure there is nothing else 65 # than whitespace or HTML tags between it and the next word, and the next word matches 66 # the first one as well. We replace all of that by whatever we captured (both the words 67 # and the stuff between them) with the same stuff, but the words are inside [ and ]. 68 # For the output we just need to add a little magic to only print actual result lines. 69 70 use v5.36; 71 72 doubled_words("you're given the job of checking the pages on a 73 web server for doubled words (such as 'this this'), a common problem 74 with documents subject to heavy editing."); 75 doubled_words("Find doubled words despite capitalization differences, such as with 'The 76 the...', as well as allow differing amounts of whitespace (spaces, 77 tabs, newlines, and the like) to lie between the words."); 78 doubled_words("to make a word bold: '...it is <B>very</B> very important...'."); 79 doubled_words("Perl officially stands for Practical Extraction and Report Language, except when it doesn't."); 80 doubled_words("There's more than one one way to do it. 81 Easy things should be easy and hard things should be possible."); 82 83 sub doubled_words($str) { 84 say "Input: \"$str\""; 85 $str =~ s/(\b\w+\b)((\s+|<[^>]*>)*)(\1)/[$1]$2\[$4]/gis; 86 my @lines = split /\n/, $str; 87 print "Output: \""; 88 my $printed = 0; 89 foreach my $line (@lines) { 90 if($line =~ m/\[/) { 91 print "\n" if $printed; 92 print $line; 93 $printed = 1; 94 } 95 } 96 say "\""; 97 }