/* 

  Read test in Picat.

  This is one of my standard test when learning a new programming
  language: read a word file and filters the words that match 
  the regular expression
      a.*b.*c..., b.*c.*d.., c.*d.*e..., etc

  Since Picat currently don't support regular expressions I've
  done some replacements...

  The pattern with most words found is this pattern of length 5:
      .*a.*b.*c.*d.*e.* 
  matching these 26 words:

    abecedaire,
    abecedaries,
    abjectedness,
    aborticide,
    absconded,
    abscondedly,
    abscondence,
    absconder,
    absconders,
    abstractedness,
    amblycephalidae,
    ambuscade,
    ambuscaded, 
    ambuscader,
    ambuscades,
    ambuscadoed,
    amebicide,
    amoebicide,
    bambocciade,
    bambochade,
    carbacidometer,
    cerambycidae,
    nonabstractedness,
    oxylabracidae,
    scabicide,
    unabstractedness


  Here are some testing the wordlist words_lower.txt (with 415834 words) 
  on 5 length patterns. check3/2 (quite simple and using "plain" recursion) 
  is by far the fastest:

  Here are the tests:

   * go/0, testing check/2: 8.664s
   * go2/0, testing check2/2: 9.756s
   * go3/0, testing check3/2:  1.51s (the fastest)
   * go3b/0, testing check3/2 and list comprehension: 1.712ss
   * go4/0, testing test_all/3 and check3/2: 21.297s
   * go7/0, testing DCG approach:
            - check_dcg3/3: 5.482s, 
            - check_dcg4/3: 3.88s

   * go5/0 is a simple DCG tests (check_dcg/3) on the pattern "abc".
     go6 doesn't work correctly.


  This Picat model was created by Hakan Kjellerstrand, hakank@gmail.com
  See also my Picat page: http://www.hakank.org/picat/

*/


import util.

main => go3.

%
% using check/2: 8.664s
% [[26,abcde],[23,rstuv],[23,lmnop],[6,efghi],[3,klmno],[1,defgh]]
% CPU time 17.5s seconds.
%
go =>
   Words = read_file_lines("words_lower.txt"),
   % Words = read_file_lines("eng_dict.txt"),   
   % Words = read_file_lines("/usr/share/dict/words"),
   % Words = read_file_lines("sv_spelling_org_utf8.txt"),
   Alpha = lower(),
   Len = 5,
   AllMatch = [],
   Patterns = [ [Alpha[J] : J in I..I+Len-1] : I in 1..Alpha.length-Len+1],
   foreach(Pattern in Patterns) 
      println(pattern=Pattern),
      Match = [],
      foreach(Word in Words,check(Word,Pattern))
        Match := Match ++ [Word]
      end,
      println(Match),
      println(len=Match.length),
      if Match.length > 0 then
         AllMatch := AllMatch ++ [[Match.length,Pattern]]
      end,
      nl        
   end,

   println(AllMatch.sort_down()),

   nl.

% Test pattern
go1 ?=>
  nl.

go1 => true.

%
% using check2: 9.756s
% [26,abcde],[23,rstuv],[23,lmnop],[6,efghi],[3,klmno],[1,defgh]]
%
go2 =>
   Words = read_file_lines("words_lower.txt"),
   % Words = read_file_lines("/usr/share/dict/words"),
   Alpha = lower(),
   Len = 5,
   AllMatch = [],
   Patterns = [ [Alpha[J] : J in I..I+Len-1] : I in 1..Alpha.length-Len+1],
   foreach(Pattern in Patterns) 
      println(pattern=Pattern),
      Match = [],
      foreach(Word in Words, check2(Word,Pattern))
        Match := Match ++ [Word]
      end,
      println(Match),
      println(len=Match.length),
      if Match.length > 0 then
         AllMatch := AllMatch ++ [[Match.length,Pattern]]
      end,
      nl        
   end,

   println(AllMatch.sort_down()),
   nl.


go2b =>
   Pattern = "abcde",
   Words = check_word_list("words_lower.txt",Pattern),
   println(Words),
   println(len=Words.length),
   nl.


%
% using check3: 1.51s
% [[26,abcde],[23,rstuv],[23,lmnop],[6,efghi],[3,klmno],[1,defgh]]
%
go3 =>
   Words = read_file_lines("words_lower.txt"),
   % Words = read_file_lines("sv_spelling_org_utf8.txt"),   
   % Words = read_file_lines("/usr/share/dict/words"),
   println(numWords=Words.len),
   Alpha = lower(),
   Len = 5,
   AllMatch = [],
   Patterns = [ [Alpha[J] : J in I..I+Len-1] : I in 1..Alpha.length-Len+1],
   foreach(Pattern in Patterns) 
      println(pattern=Pattern),
      Match = [],
      foreach(Word in Words, check3(Word,Pattern))
         Match := Match ++ [Word]
      end,
      println(Match),
      println(len=Match.length),
      if Match.length > 0 then
         AllMatch := AllMatch ++ [[Match.length,Pattern]]
      end,
      nl        
   end,

   println(AllMatch.sort_down()),

   nl.

%
% Variant using list comprehension (and a slightly different presentation): 1.712s
%
go3b =>
   % Words = read_file_lines("words_lower.txt"),
   % Words = read_file_lines("/usr/share/dict/words"),
   Words = read_file_lines("/home/hakank/public_html/combograms/sv_spelling_org_utf8.txt"),   
   Alpha = lower(),
   Len = 5,
   Patterns = [ [Alpha[J] : J in I..I+Len-1] : I in 1..Alpha.length-Len+1],
   AllMatch = [[Matches.len,Pattern,Matches] :  Pattern in Patterns, Matches=[Word : Word in Words, check3(Word,Pattern)], Matches.len > 0].sort_down(),
   println(AllMatch.join("\n")),

   nl.

%
% slower due to the overhead of call/3: 21.297s
%
go4 =>
  test_all("words_lower.txt",5,check3).

% Just a test of check_dcg/3 with "abc"
go5 ?=>
  Words = read_file_lines("words_lower.txt"),
  % Words = read_file_lines("sv_spelling_org_utf8.txt"),
  foreach(Word in Words,check_dcg(Word,[])) 
    println(Word)
  end,
  nl.

go5 => true.

%
% Testing check_dcg2/3: This don't work correctly!
%
go6 ?=>
  Len = 5,
  Alpha = lower(),
  % Words = read_file_lines("sv_spelling_org_utf8.txt"),
  Words = read_file_lines("words_lower.txt"),
  Patterns = [ [Alpha[J] : J in I..I+Len-1] : I in 1..Alpha.length-Len+1],
  Counts = [],
  foreach(Pattern in Patterns)
    print(Pattern),
    Matches = [],
    foreach(Word in Words,check_dcg2(Pattern,Word,[]))
       Matches := Matches ++ [Word]
    end,
    println(Matches),
    MatchesLen = Matches.len,
    println(len=MatchesLen),
    if MatchesLen > 0 then
      Counts := Counts ++ [Pattern=MatchesLen]
    end
  end,
  println(Counts),
  nl.

go6 => true.

%
% check_gcd3: 5.482s
% check_gcd4: 3.88s
%
go7 ?=>
  Alpha = lower(),
  Len = 5,  
  Patterns = [ [Alpha[J] : J in I..I+Len-1] : I in 1..Alpha.length-Len+1],
  Words = read_file_lines("words_lower.txt"),    
  % Words = read_file_lines("unixdict.txt"),
  % Words = read_file_lines("sv_spelling_org_utf8.txt"),
  Counts = [],
  foreach(Pattern in Patterns)
    println(Pattern),
    Matches = [],
    % foreach(Word in Words,check_dcg3(Pattern,Word,[]))
    foreach(Word in Words,check_dcg4(Pattern,Word,[]))    
      Matches := Matches ++ [Word]
    end,
    println(Matches),
    MatchesLen = Matches.len,
    println(len=MatchesLen),
    if MatchesLen > 0 then
      Counts := Counts ++ [Pattern,MatchesLen]
    end
  end,
  println(Counts),
  nl.

go7 => true.


%
% Here are some explorations on DCG.
% gcd3/3 is the only that is reasonable fast.
%
alphabet(A) =>
   A = [C : C in "abcdefghijklmnopqrstuvwxyzåäö"].

alphabet() = "abcdefghijklmnopqrstuvwxyzåäö".

% Just testing "*.a.*b.*c.*"
check_dcg --> any_star, a, any_star, b, any_star, c, any_star, d, any_star.


a --> ['a'].
b --> ['b'].
c --> ['c'].
d --> ['d'].
e --> ['e'].
f --> ['f'].
g --> ['g'].
h --> ['h'].
i --> ['i'].
j --> ['j'].
k --> ['k'].
l --> ['l'].
m --> ['m'].
n --> ['n'].
o --> ['o'].
p --> ['p'].
q --> ['q'].
r --> ['r'].
s --> ['s'].
t --> ['t'].
u --> ['u'].
v --> ['v'].
w --> ['w'].
x --> ['x'].
y --> ['y'].
z --> ['z'].
aa --> ['å'].
ae --> ['ä'].
oe --> ['ö'].

any_star --> "" ; any_star1.

any_star1 --> char, any_star.
char --> [C], {membchk(C,alphabet())}.

char(C) --> [C], {membchk(C,alphabet())}.


% Another approach: Don't work correctly
check_dcg2([H|Tail]) --> any_star, char(H), check_dcg2(Tail).
check_dcg2([]) --> [].

% Another approach: Quite fast.
any_star3 --> [] ; ([C], { char(C) },  any_star3).
check_dcg3([C|Cs]) --> any_star3, char(C), check_dcg3(Cs).
check_dcg3([]) --> any_star3.

% Same as check_dcg3, but without checking with char(C).
% Faster than check_dcg3.
any_star4 --> [] ; [_C],  any_star4.
check_dcg4([C|Cs]) --> any_star4, [C], check_dcg4(Cs).
check_dcg4([]) --> any_star4.


lower() = [chr(I+96) : I in 1..26].
upper() = [chr(I+64) : I in 1..26].

lower_swe() = [chr(I+96) : I in 1..26] ++ "åäö".


%
% Check if the pattern P match the string String. 
% Pattern P corresponds to the regular expression
%    /.*P[1].*P[2].*P[3].*...P[P.length].*/
%
check(String,Pattern) =>
   S = String,
   PLen = Pattern.length,
   Ix = 1,
   Found = 0,
   % while (Ix <= PLen, once(find(S,Pattern[Ix],From,_To)))
   while (Ix <= PLen, find2(S,Pattern[Ix],From,_To))
      SS = substr(S,From),
      S := SS,
      Found := Found + 1,
      Ix := Ix + 1
   end,
   Found == Pattern.length.


%
% Another approach: about the same time as check/2.
%
check2(String,Pattern) =>
   PLen = Pattern.length,
   SIx = 1,
   PIx = 1,
   Found = 0,
   while (SIx <= String.length, PIx <= PLen, Found < PLen)
      if String[SIx] == Pattern[PIx] then 
         Found := Found + 1,
         PIx := PIx + 1
      end,
      SIx := SIx + 1
   end,
   Found == PLen.


%
% check3/3: Plain recursive version.
% Much faster - and smaller - than check/2 and check2/2.
%

% check each character in the string against the pattern
check3([SH|ST],P@[PH|PT]) => 
  (SH == PH -> 
    check3(ST,PT)
   ; 
    check3(ST,P)
  ).

% empty pattern (this is the goal)
check3(_S,[]) => 
  true.


%
% General testing of the predicates. 
% However the use of call/n slows it down considerably.
% E.g. test_all("words_lower.txt",5,check3) takes 21.297s, compared to go3's 1.5s
%
test_all(Dict,Len,CheckPred) =>
   Words = read_file_lines(Dict),
   Alpha = lower(),
   AllMatch = [],
   Patterns = [ [Alpha[J] : J in I..I+Len-1] : I in 1..Alpha.length-Len+1],
   foreach(Pattern in Patterns) 
      println(pattern=Pattern),
      Match = [],
      foreach(Word in Words,call(CheckPred,Word,Pattern))
         Match := Match ++ [Word]
      end,
      println(Match),
      println(len=Match.length),
      if Match.length > 0 then
         AllMatch := AllMatch ++ [[Match.length,Pattern]]
      end,
      nl        
   end,

   println(AllMatch.sort_down()),

   nl.


substr(S, From) = [S[I] : I in From..S.length].
substr(S, From, To) = [S[I] : I in From..To].

check_word_list(File,Pattern) = Words =>
    FD = open(File),    
    Words = [],
    while (not at_end_of_stream(FD)) 
       Word := read_line(FD),
       if check(Word,Pattern) then
          % println(Word),
          Words := Words ++ [Word]
       end
    end,
    close(FD).


%
% find/4 requires that the Substring is a string.
%
find2(String, Substring, From, To), char(Substring) => 
  find(String, Substring.to_string(), From, To).

find2(String, Substring, From, To) => 
  find(String, Substring, From, To).