The weekly challenge 376 - Task 2: Doubled Words

 1 #!/usr/bin/env perl
 2 # https://theweeklychallenge.org/blog/perl-weekly-challenge-376/#TASK2
 3 #
 4 # Task 2: Doubled Words
 5 # =====================
 6 #
 7 # You are given a string (which may contain embedded newlines) which is taken
 8 # from a page on a website. The string will not contain brackets qw{ [ ] }.
 9 #
10 # Write a script that will find doubled words (such as “this this”) and
11 # highlight (wrap in brackets) each doubled word.
12 #
13 # The script should:
14 #
15 #   - Work across lines, even finding situations where a word at the end of
16 #     one line is repeated at the beginning of the next.
17 #
18 #   - Find doubled words despite capitalization differences, such as with
19 #     'The the...', as well as allow differing amounts of whitespace (spaces,
20 #     tabs, newlines, and the like) to lie between the words.
21 #
22 #   - Find doubled words even when separated by HTML tags. For example, to
23 #     make a word bold: '...it is <B>very</B> very important...'. Only show
24 #     lines containing doubled words.
25 #
26 # Example 1
27 #
28 # Input: $str = "you're given the job of checking the pages on a\nweb server
29 # for doubled words (such as 'this this'), a common problem\nwith documents
30 # subject to heavy editing."
31 # Output: "web server for doubled words (such as '[this] [this]'), a common problem"
32 #
33 # Example 2
34 #
35 # Input: $str = "Find doubled words despite capitalization differences, such as
36 # with 'The\nthe...', as well as allow differing amounts of whitespace
37 # (spaces,\ntabs, newlines, and the like) to lie between the words."
38 # Output: "Find doubled words despite capitalization differences, such as with
39 # '[The]\n[the]...', as well as allow differing amounts of whitespace (spaces,"
40 #
41 # Example 3
42 #
43 # Input: $str = "to make a word bold: '...it is <B>very</B> very important...'."
44 # Output: "to make a word bold: '...it is <B>[very]</B> [very] important...'."
45 #
46 # Example 4
47 #
48 # Input: $str = "Perl officially stands for Practical Extraction and Report
49 # Language, except when it doesn't."
50 # Output: ""
51 #
52 # Example 5
53 #
54 # Input: $str = "There's more than one one way to do it.\nEasy things should be
55 # easy and hard things should be possible."
56 # Output: "There's more than [one] [one] way to do it."
57 #
58 ############################################################
59 ##
60 ## discussion
61 ##
62 ############################################################
63 #
64 # This is a fun regex one. We capture the first word, make sure there is nothing else
65 # than whitespace or HTML tags between it and the next word, and the next word matches
66 # the first one as well. We replace all of that by whatever we captured (both the words
67 # and the stuff between them) with the same stuff, but the words are inside [ and ].
68 # For the output we just need to add a little magic to only print actual result lines.
69 
70 use v5.36;
71 
72 doubled_words("you're given the job of checking the pages on a
73 web server for doubled words (such as 'this this'), a common problem
74 with documents subject to heavy editing.");
75 doubled_words("Find doubled words despite capitalization differences, such as with 'The
76 the...', as well as allow differing amounts of whitespace (spaces,
77 tabs, newlines, and the like) to lie between the words.");
78 doubled_words("to make a word bold: '...it is <B>very</B> very important...'.");
79 doubled_words("Perl officially stands for Practical Extraction and Report Language, except when it doesn't.");
80 doubled_words("There's more than one one way to do it.
81 Easy things should be easy and hard things should be possible.");
82 
83 sub doubled_words($str) {
84     say "Input: \"$str\"";
85     $str =~ s/(\b\w+\b)((\s+|<[^>]*>)*)(\1)/[$1]$2\[$4]/gis;
86     my @lines = split /\n/, $str;
87     print "Output: \"";
88     my $printed = 0;
89     foreach my $line (@lines) {
90         if($line =~ m/\[/) {
91             print "\n" if $printed;
92             print $line;
93             $printed = 1;
94         }
95     }
96     say "\"";
97 }