/* Rosalind problem Finding a Shared Motif in Picat. https://rosalind.info/problems/lcsm/ """ Problem A common substring of a collection of strings is a substring of every member of the collection. We say that a common substring is a longest common substring if there does not exist a longer common substring. For example, "CG" is a common substring of "ACGTACGT" and "AACCGTATA", but it is not as long as possible; in this case, "CGTA" is a longest common substring of "ACGTACGT" and "AACCGTATA". Note that the longest common substring is not necessarily unique; for a simple example, "AA" and "CC" are both longest common substrings of "AACC" and "CCAA". Given: A collection of k (k≤100) DNA strings of length at most 1 kbp each in FASTA format. Return: A longest common substring of the collection. (If multiple solutions exist, you may return any single solution.) Sample Dataset >Rosalind_1 GATTACA >Rosalind_2 TAGACCA >Rosalind_3 ATACA Sample Output AC """ go4/0 is the way to go! Takes about 36s This model was created by Hakan Kjellerstrand, hakank@gmail.com See also my Picat page: http://www.hakank.org/picat/ */ import util. import rosalind_utils. import cp. % import sat. main => go. % % This is too slow for the challenge. % Use go/4 instead. go ?=> garbage_collect(300_000_000), File = "rosalind_lcsm_fasta1.txt", % File := "rosalind_lcsm_fasta_challenge.txt", once(Fs = parse_fasta_file(File)), Fs2 = [F[2] : F in Fs], FsLen = Fs2.len, println(num_fs=Fs2.len), println(lens=[F.len : F in Fs2]), longest_common_substring1(Fs2,Substr,_Len), println(Substr), nl. go => true. % Too slow for the Challenge go2 ?=> File = "rosalind_lcsm_fasta1.txt", once(Fs = parse_fasta(read_file_chars(File))), % println(fs=Fs), Fs2 = [F[2] : F in Fs], println(fs2=Fs2), % Find one of the longest common substrings longest_common_substring1(Fs2,_Substr,Len), % And generate all of them All = findall(Substr2,common_substring1(Fs2,Substr2,Len)), println(All), nl. go2 => true. % % CP: Works for small file but not for the Challenge. % go3 ?=> File = "rosalind_lcsm_fasta1.txt", % File := "rosalind_lcsm_fasta_challenge.txt", once(Fs = parse_fasta_file(File)), Fs2 = [F[2] : F in Fs], % convert to integers Map = new_map(['A'=1,'C'=2,'G'=3,'T'=4]), FsInt = [], foreach(F in Fs2) FsInt := FsInt ++ [[Map.get(C) : C in F]] end, lcs_cp(FsInt.sort(),Substr1,N), println(substr1=Substr1), ACGT = "ACGT", Substr = [ACGT[I] : I in Substr1], println(substr=Substr), println(n=N), nl. go3 => true. % % Using a port from Python. % % This is reasonably fast (<5min): about 35s % go4 ?=> % Strings = ["Oh, hello, my friend.", % "I prefer Jelly Belly beans.", % "When hell freezes over!"], File = "rosalind_lcsm_fasta1.txt", % File := "rosalind_lcsm_fasta_challenge.txt", once(Fs = parse_fasta(read_file_chars(File))), Fs2 = [F[2] : F in Fs], % Substr = long_substr(Fs2.sort()), % with sort: 36.188s Substr = long_substr(Fs2), % without sorting: 32.593s! nl, println(Substr), nl. go4 => true. % % Works with small dataset but too slow for % the challange prolem. % lcs_cp(Fs,X,N) => NumFs = Fs.len, Lens = [F.len : F in Fs], println(lens=Lens), MaxLen=max(Lens), MinLen=min(Lens), println(minLen=MinLen), % TODO: binary search? member(N,MinLen..-1..1), nl, println(n=N), % The substring X = new_list(N), X :: 1..4, % Offset to substring Ix = new_list(NumFs), Ix :: 1..MaxLen, foreach(I in 1..NumFs) println(i=I), Ix[I] :: 1..Lens[I], % Is the substring (X) in this string? foreach(J in 1..N) T #= Ix[I]+J, element(T,Fs[I],X[J]) end end, println(solve), solve($[ffc,split],X++Ix), println(X=Ix). % Python code from % From https://stackoverflow.com/questions/2892931/longest-common-substring-from-more-than-two-strings long_substr(Data) = Substr => Substr1 = '', if Data.len > 1, Data[1].len > 0 then foreach(I in 1..Data[1].len) foreach(J in 1..Data[1].len-I) DD = Data[1,I..I+J-1], if J > Substr1.len, is_substr(DD,Data) then Substr1 := DD, println(found1=Substr1) end end end end, Substr = Substr1. is_substr(Find,Data) :- foreach(D in Data) once(find(D,Find,_,_)) end. % % These are from longest_common_substring.pi % % Find (one of) the longest common substring/sublist. % This is quite elegant, but too slow. Way too slow! % % (From longest_common_substring.pi) % table(+,-,max) longest_common_substring1(Ls,Substr,Len) => foreach(L in Ls) append(_,Substr,_,L), Substr != [] end, Len = Substr.length. % % Generate all substrings on backtrack. % For getting all longest substrings % it's assumed that Len is fixed % with the length of the longest substring. % common_substring1(Ls,Substr,Len) => foreach(L in Ls) append(_,Substr,_,L), Substr != [] end, Len = Substr.length.