/* 

  Word frequency (Rosetta code) in Picat.

  http://rosettacode.org/wiki/Word_frequency
  """
  Task

  Given a text file and an integer n, print/display the n most common words in the file 
  (and the number of their occurrences) in decreasing frequency.


  For the purposes of this task:

  - A word is a sequence of one or more contiguous letters.
  - You are free to define what a   letter   is.
  - Underscores, accented letters, apostrophes, hyphens, and other special characters can be handled at your discretion.
  - You may treat a compound word like   well-dressed   as either one word or two.
  - The word   it's   could also be one or two words as you see fit.
  - You may also choose not to support non US-ASCII characters.
  - Assume words will not span multiple lines.
  - Don't worry about normalization of word spelling differences.
  - Treat   color   and   colour   as two distinct words.
  - Uppercase letters are considered equivalent to their lowercase counterparts.
  - Words of equal frequency can be listed in any order.
  - Feel free to explicitly state the thoughts behind the program decisions.


  Show example output using Les Misérables from Project Gutenberg as the text file input and display 
  the top   10   most used words. 
  """

  https://www.gutenberg.org/files/135/135-0.txt

  This program was created by Hakan Kjellerstrand, hakank@gmail.com
  See also my Picat page: http://www.hakank.org/picat/

*/

import util.

% Program entry point: delegates to go/0.
main => go.

/*
Sample output (Project Gutenberg header/footer removed), one line per split type:

split_type = all
[the = 40907,of = 19830,and = 14872,a = 14487,to = 13872,in = 11157,he = 9645,was = 8618,that = 7908,it = 6626]

split_type = space_punct
[the = 40193,of = 19779,and = 14668,a = 14227,to = 13538,in = 11033,he = 9455,was = 8604,that = 7576,” = 6578]

split_type = space
[the = 40193,of = 19747,and = 14402,a = 14222,to = 13512,in = 10964,he = 9211,was = 8345,that = 7235,his = 6414]

*/
%
% Print the NBest most frequent words of the book, once for each
% split type (all, space_punct, space).
%
% The text between the Project Gutenberg START and END marker lines is
% lowercased, split into words on the separator characters given by
% split_chars/2, counted with freq/1 and sorted by decreasing count.
%
% The predicate fails if the file does not contain both marker lines.
%
go =>
  NBest = 10,
  File = "les_miserables.txt",
  Chars = read_file_chars(File),

  % Remove Project Gutenberg header/footer.
  % find/4 yields further occurrences on backtracking; once/1 commits to
  % the first match so that the markers are located exactly once.
  once(find(Chars,"*** START OF THE PROJECT GUTENBERG EBOOK LES MISÉRABLES ***",_,HeaderEnd)),
  once(find(Chars,"*** END OF THE PROJECT GUTENBERG EBOOK LES MISÉRABLES ***",FooterStart,_)),

  Book = [to_lowercase(C) : C in slice(Chars,HeaderEnd+1,FooterStart-1)],

  % Report the top NBest words for each way of splitting the text.
  % SplitChars and Words first occur inside the loop, so they are fresh
  % in every iteration.
  foreach(SplitType in [all,space_punct,space])
    println(split_type=SplitType),
    split_chars(SplitType,SplitChars),
    Words = split(Book,SplitChars),

    % Count frequencies and present result:
    % the entries are Word=Count terms, sorted on argument 2 (the count).
    println(freq(Words).to_list.sort_down(2).take(NBest)),
    nl
  end.


%
% split_chars(SplitType,SplitChars):
% SplitChars is the string of separator characters used for SplitType.
%   all         : whitespace, punctuation, brackets, quotes, dashes,
%                 underscore and asterisk
%   space_punct : whitespace and basic sentence punctuation
%   space       : whitespace only
%
split_chars(all,"\n\r \t,;!.?()[]”\"-“—-__‘’*").
split_chars(space_punct,"\n\r \t,;!.?").
split_chars(space,"\n\r \t").


% Variant that counts words over the whole file, including the
% Project Gutenberg header and footer. Sample output:
/*
split_type = all
[the = 41094,of = 19952,and = 14939,a = 14545,to = 13954,in = 11218,he = 9647,was = 8620,that = 7922,it = 6641]

split_type = space_punct
[the = 40378,of = 19901,and = 14734,a = 14284,to = 13620,in = 11094,he = 9457,was = 8606,that = 7590,” = 6578]

split_type = space
[the = 40378,of = 19869,and = 14468,a = 14278,to = 13590,in = 11025,he = 9213,was = 8347,that = 7249,his = 6414]
*/
%
% Print the NBest most frequent words of the whole file (nothing is
% stripped), once for each split type (all, space_punct, space).
%
% The text is lowercased, split on the separator characters given by
% split_chars/2, counted with freq/1 and sorted by decreasing count.
%
go2 =>
  NBest = 10,
  File = "les_miserables.txt",
  Book = [to_lowercase(C) : C in read_file_chars(File)],

  % A foreach loop (instead of member/2 followed by fail) lets the
  % predicate succeed once all split types have been reported.
  foreach(SplitType in [all,space_punct,space])
    println(split_type=SplitType),
    split_chars(SplitType,SplitChars),
    Words = split(Book,SplitChars),
    % The entries are Word=Count terms, sorted on argument 2 (the count).
    println(freq(Words).to_list.sort_down(2).take(NBest)),
    nl
  end.


%
% Freq is a map of the frequency of each element in L
%
%
% freq(Items) = Counts:
% Counts is a map from each distinct element of Items to the number
% of times it occurs in Items.
%
freq(Items) = Counts =>
  Counts = new_map(),
  foreach(Item in Items)
    % Elements not seen before start from a count of 0.
    Seen = Counts.get(Item,0),
    Counts.put(Item,Seen+1)
  end.