/*

  Utils for the nn module in Picat.

  The basis for this is to use ARFF files (see parse_arff.pi for more info):
  - Right now, the focus is on classification problems.
  - The ARFF parser doesn't handle NAs (i.e. '?') in the dataset.
  - The workflow revolves around these files:
     * base.arff_train.data
     * base.arff_test.data
     * base.arff.net
    See parse_arff.pi for more details.

  - For each dataset one must write a specific program with the following:
    * go/0 (or main/0)
      This must include the following (from train_iris.pi)

         Base = "iris.arff",
         Map = new_map([1='Iris-setosa',2='Iris-versicolor',3='Iris-virginica']),
         train_and_test(Base, Map),
         nl.

    * train_net(TrainFile, NetFile,NumInputs,NumOutputs)
      which includes all the parameters to train the dataset.
      This is then called in train_and_test/2 from this module.

  See train_*.pi for examples.

  This Picat model was created by Hakan Kjellerstrand, hakank@gmail.com
  See also my Picat page: http://www.hakank.org/picat/

*/

module nn_hakank.

import nn.
import util.

%
% train_and_test(Base)
%
% Convenience entry point for numeric predictions: runs train_and_test/2
% with an empty category map.
%
train_and_test(Base) =>
    train_and_test(Base, new_map()).

%
% train_and_test(Base, Map)
%
% Derives the three file names from Base (Base_train.data, Base_test.data,
% Base.net), reads the input/output counts from the header line of the
% training file, calls the dataset-specific train_net/4 and then evaluates
% the result:
%   - more than one output -> test_net/3 (classification, uses Map)
%   - otherwise            -> test_net_numeric/2
% Both the training and the test step are wrapped in time/1.
%
train_and_test(Base, Map) =>
    TrainFile = Base ++ "_train.data",
    TestFile  = Base ++ "_test.data",
    NetFile   = Base ++ ".net",
    [NumIn,NumOut] = read_format_line(TrainFile),
    println([numInputs=NumIn,numOutputs=NumOut]),
    time(train_net(TrainFile, NetFile, NumIn, NumOut)),
    if NumOut =< 1 then
        time(test_net_numeric(TestFile,NetFile))
    else
        time(test_net(TestFile,NetFile,Map))
    end,
    nl.
%
% test_net(TestFile, NetFile, CategoryMap)
%
% For classification problems (# outputs > 1).
%
% Loads the network from NetFile (nn_load) and the test set from TestFile
% (nn_train_data_load), runs the network on every test instance and compares
% the position of the largest expected output with the position of the
% largest predicted output (see max_arg/1).
%
% CategoryMap is only used for its number of keys, which gives the size of
% the confusion matrix. Rows are indexed by the expected class, columns by
% the predicted class.
%
% Prints per-instance progress, a final summary and the confusion matrix,
% and finally releases the nn resources with nn_destroy_all.
%
test_net(TestFile, NetFile,CategoryMap) =>
    NN = nn_load(NetFile),
    TestData = nn_train_data_load(TestFile),
    println(testData=TestData),
    NumCorrect = 0,
    NumTests = 0,
    Len = CategoryMap.keys.len,
    ConfusionMatrix = new_array(Len,Len),
    bind_vars(ConfusionMatrix,0),
    foreach(D in TestData)
        NumTests := NumTests + 1,
        Output = D[2],
        % println([input=D[1],output=D[2]]),
        Predict = nn_run(NN,D[1]),
        % println(predict=Predict),
        SelOutput = max_arg(Output),
        SelPredict = max_arg(Predict),
        if SelOutput == SelPredict then
            NumCorrect := NumCorrect + 1
        end,
        println([output=Output,predict=Predict]),
        println([output=SelOutput,predict=SelPredict,num_correct=NumCorrect,num_tests=NumTests, pct=(NumCorrect/NumTests)]),
        nl,
        ConfusionMatrix[SelOutput,SelPredict] := ConfusionMatrix[SelOutput,SelPredict] + 1
    end,
    % Guard the final ratio: an empty test set would otherwise divide by zero.
    if NumTests > 0 then
        Pct = NumCorrect/NumTests
    else
        Pct = 0.0
    end,
    println(result=[num_correct=NumCorrect,num_tests=NumTests, pct=Pct]),
    nl,
    println("Confusion matrix:"),
    foreach(Row in ConfusionMatrix)
        println(Row)
    end,
    nl,
    nn_destroy_all.

%
% test_net_numeric(TestFile, NetFile)
%
% For numeric prediction (a single output value).
%
% Loads the network and the test set, runs the network on every instance
% with the options scaleIn(1), scaleOut(-1) and resetMSE, and accumulates the
% absolute difference between the first predicted value and the first
% expected value.
%
% Prints per-instance progress and the total absolute difference, then
% releases the nn resources with nn_destroy_all.
%
test_net_numeric(TestFile, NetFile) =>
    NN = nn_load(NetFile),
    TestData = nn_train_data_load(TestFile),
    NumTests = 0,
    TotDiff = 0,
    foreach(D in TestData)
        NumTests := NumTests + 1,
        Input = D[1],
        Output = D[2,1],
        println([input=D[1],output=Output]),
        Predict1 = nn_run(NN,Input,$[scaleIn(1),scaleOut(-1),resetMSE]),
        Predict = Predict1[1],
        Diff = abs(Predict-Output),
        TotDiff := TotDiff + Diff,
        println([output=Output,predict=Predict,num_tests=NumTests, diff=Diff, totDiff=TotDiff]),
        nl
    end,
    % There is no notion of a "correct" prediction here, so the summary only
    % reports the number of tests and the accumulated absolute difference.
    println(result=[num_tests=NumTests, totDiff=TotDiff]),
    println(totDiff=TotDiff),
    nl,
    nn_destroy_all.

%
% max_arg(L) = Val
%
% Val is the (1-based) index of the first occurrence of the maximum
% element of L.
%
max_arg(L) = Val =>
    MaxVal = max(L.to_list),
    Val = first([I : I in 1..L.len, L[I] == MaxVal]).
%
% read_format_line(File) = [NumInputs,NumOutputs]
%
% Reads the first line of File, which holds three whitespace separated
% integers (number of instances, number of inputs, number of outputs),
% and returns the last two.
%
read_format_line(File) = [NumInputs,NumOutputs] =>
    FH = open(File),
    Line = read_line(FH),
    close(FH),
    [_,NumInputs,NumOutputs] = [I.to_int() : I in Line.split()].


%%
%% For parsing an ARFF file.
%%
%% parse_and_split_file(File)
%% parse_and_split_file(File, ClassIx)
%% parse_and_split_file(File, ClassIx, Pct)
%% parse_and_split_file(File, ClassIx, Pct, NoShuffle)
%%
%% Defaults: ClassIx = -1 (the last attribute is the class),
%%           Pct = 0.8 (fraction of the instances used for training),
%%           NoShuffle = 0 (the instances are shuffled before the split).
%%
parse_and_split_file(File) =>
    parse_and_split_file(File, -1, 0.8, 0).

parse_and_split_file(File,ClassIx) =>
    parse_and_split_file(File, ClassIx, 0.8, 0).

parse_and_split_file(File,ClassIx,Pct) =>
    parse_and_split_file(File, ClassIx, Pct, 0).

%%
%% Parses File with parse_arff/2, optionally shuffles the instances
%% (skipped when NoShuffle == 1) and writes
%%   File ++ "_train.data"   always
%%   File ++ "_test.data"    only when Pct < 1
%% Each data file starts with a header line "NumInstances NumInputs NumOutputs"
%% followed by two lines per instance: the inputs and the outputs.
%%
parse_and_split_file(File, ClassIx, Pct, NoShuffle) =>
    [Data,Format,Info] = parse_arff(File,ClassIx),
    println(format=Format),

    %%
    %% Info: TODO
    %% The idea is to use this information to
    %% simplify training and testing.
    %% But right now I'm not sure how to represent it (and read it back).
    %%
    println("Info:"),
    Info.put(pct,Pct),
    Info.put(no_shuffle,NoShuffle),
    println(info=Info),
    /*
    foreach(Key=Value in Info)
      if Key == attributes then
        print_attribute_info(Value)
      elseif map(Value) then
        foreach(Key2=Value2 in Value)
          if map(Value2) then
            foreach(Key3=Value3 in Value2)
              printf("%w: %w\n", Key3,Value3)
            end
          else
            printf("%w: %w\n", Key2,Value2)
          end
        end
      else
        printf("%w: %w\n", Key,Value)
      end
    end,
    println("end_info"),
    */

    if NoShuffle == 1 then
        DataShuffled = Data
    else
        DataShuffled = shuffle(Data)
    end,

    % Split the data (when requested) and build the header of each file:
    % same input/output counts as Format, but with the subset's own size.
    if Pct < 1 then
        [DataTrain,DataTest] = split_dataset(DataShuffled,Pct),
        TrainFormat = [DataTrain.len,Format[2],Format[3]],
        TestFormat = [DataTest.len,Format[2],Format[3]],
        println(train_len=DataTrain.len),
        println(test_len=DataTest.len)
    else
        DataTrain = DataShuffled,
        println(train_len=DataTrain.len),
        TrainFormat = [DataTrain.len,Format[2],Format[3]]
    end,

    TrainFile = File ++ "_train.data",
    println(train_file=TrainFile),
    TrainHandle = open(TrainFile,write),
    if Pct < 1 then
        TestFile = File ++ "_test.data",
        TestHandle = open(TestFile,write),
        println(test_file=TestFile)
    end,

    println("Generating training set"),
    println(TrainHandle,[TrainFormat[I].to_string : I in 1..3].join(' ')),
    foreach([In,Out] in DataTrain)
        println(TrainHandle,In),
        println(TrainHandle,Out)
    end,
    close(TrainHandle),

    if Pct < 1 then
        println("Generating Test set"),
        println(TestHandle,[TestFormat[I].to_string : I in 1..3].join(' ')),
        foreach([In,Out] in DataTest)
            println(TestHandle,In),
            println(TestHandle,Out)
        end,
        close(TestHandle)
    end,
    nl.


%%
%% parse_arff(File) = [Data,Format,Info]
%% parse_arff(File, ClassIx) = [Data,Format,Info]
%%
%% Parses an ARFF file.
%%
%%   Data   : list of [Input, Output] pairs, both space separated strings.
%%            Nominal attributes ({a,b,c} in the header) are replaced by
%%            their one-hot encoding (see hot_encode/1).
%%   Format : [NumInstances, NumInputValues, NumOutputValues]
%%   Info   : map with class_index, attributes (name -> encoding map or
%%            lowercased type), num_instances, num_input_values and
%%            num_output_values.
%%
%% ClassIx = -1 means that the last attribute is the class.
%%
%% Lines that are empty or start with '%' are skipped.
%% A data line containing '?' (NA) is not supported: the offending line is
%% printed and the program halts.
%%
parse_arff(File) = parse_arff(File,-1).

parse_arff(File,ClassIx) = [Data,Format,Info] =>
    println($parse_arff(File,ClassIx)),
    Lines = read_file_lines(File),
    GotData = false,
    AttributesMap = new_map(),
    AttributesList = [],
    AttributeIx = 1,
    NumAttributes = 0,
    Data = [],
    Encodings = new_map(),
    NumInputValues = _,
    NumOutputValues = _,
    Info = new_map(),
    Info.put(class_index,ClassIx),
    Info.put(attributes,new_map()),

    foreach(Line in Lines, Line.len > 0, Line[1] != '%')
        % println(line=Line),
        if Line[1] == '@' then
            Header = split(Line),
            Key = Header[1].to_lowercase,
            if Key == "@data" then
                %% @Data: the header is complete, resolve the class index.
                GotData := true,
                if ClassIx == -1 then
                    ClassIx := AttributesList.len,
                    Info.put(class_index,AttributesList.len)
                end,
                NumAttributes := AttributesList.len
            elseif Key == "@attribute" then
                %% @Attribute
                Name = Header[2],
                Type = Header[3],
                P1 = find_first_of(Line,'{'),
                P2 = find_first_of(Line,'}'),
                IsCategory = false,
                if P1 > 0, P2 > 0 then
                    % Nominal attribute: the type is the text between the
                    % braces, with all spaces removed.
                    Type := [Line[P] : P in P1+1..P2-1],
                    Type := delete_all(Type,' '),
                    IsCategory := true
                end,
                AttributesList := AttributesList ++ [Name],
                AttributesMap.put(Name,AttributeIx),
                AttributeIx := AttributeIx + 1,
                %% Categorical?
                if IsCategory then
                    TypeList = split(Type,","),
                    HotEncoding = hot_encode(TypeList),
                    Encodings.put(Name,HotEncoding),
                    Info.get(attributes).put(Name,HotEncoding)
                else
                    Info.get(attributes).put(Name,Type.to_lowercase)
                end
            elseif Key == "@relation" then
                %% @Relation
                true
            end
        else
            %%
            %% Data lines
            %%
            if GotData then
                if find_first_of(Line,'?') > 0 then
                    println("Sorry, the file includes NAs (?) which is not handled."),
                    println("Here's the offending line:"),
                    println(Line),
                    halt
                end,
                DataLine = split(Line,","),
                % Replace every nominal value with its one-hot encoding,
                % written as a space separated string.
                foreach({A,I} in zip(AttributesList,1..NumAttributes))
                    if Encodings.has_key(A) then
                        DataLine[I] := [E.to_string() : E in Encodings.get(A).get(DataLine[I])].join(' ')
                    end
                end,
                if var(NumOutputValues) then
                    NumOutputValues := split(DataLine[ClassIx]).len
                end,
                Input = [DataLine[I].to_string() : I in 1..NumAttributes, I != ClassIx].join(' '),
                NumInputValues := Input.split().len,
                % The class column is already in its final form: either the
                % space separated one-hot string (nominal class) or the raw
                % numeric text. Emitting it as-is keeps a numeric class
                % intact even when other attributes are nominal.
                Output = DataLine[ClassIx],
                println(output=Output),
                Data := Data ++ [[Input,Output]]
            end
        end
    end,

    NumInstances = Data.len,
    Info.put(num_instances,NumInstances),
    Info.put(num_input_values,NumInputValues),
    Info.put(num_output_values,NumOutputValues),
    Format = [NumInstances,NumInputValues,NumOutputValues].


%%
%% hot_encode(L) = HotEncoding
%% hot_encode(L, Zero) = HotEncoding
%%
%% Encodes the nominal values in list L. HotEncoding is a map from each
%% value L[I] to a list of length L.len with 1 in position I and Zero in
%% all other positions:
%%    0,0,...1,...,0,0       (Zero = 0, the default)
%% or e.g.
%%    -1,-1,...1,...,-1,-1   (Zero = -1)
%%
hot_encode(L) = hot_encode(L, 0).

hot_encode(L, Zero) = HotEncoding =>
    Len = L.len,
    HotEncoding = new_map(),
    foreach(I in 1..Len)
        E = [Zero : _ in 1..Len],
        E[I] := 1,
        HotEncoding.put(L[I], E)
    end.

%
% shuffle(List) = ShuffledList
%
% ShuffledList is a random permutation of List (Fisher-Yates: position I is
% swapped with a random position in I..Len, so every permutation is equally
% likely). The random generator is seeded with random2() first.
%
shuffle(List) = List2 =>
    List2 = List,
    Len = List.length,
    _ = random2(),
    foreach(I in 1..Len-1)
        R = I + (random() mod (Len-I+1)),
        List2 := swap(List2,I,R)
    end.

%
% swap(L,I,J) = L2
%
% Swap position I <=> J in list L
%
swap(L,I,J) = L2, list(L) =>
    L2 = L,
    T = L2[I],
    L2[I] := L2[J],
    L2[J] := T.
%
% split_dataset(Dataset, Pct) = [Data1,Data2]
%
% Splits Dataset in two parts: Data1 holds the first round(Len*Pct)
% elements and Data2 holds the remaining ones (Len is the length of
% Dataset), both in their original order.
%
split_dataset(Dataset, Pct) = [Data1,Data2] =>
    Total = length(Dataset),
    Cut = round(Total*Pct),
    Data1 = [Dataset[K] : K in 1..Cut],
    Data2 = [Dataset[K] : K in Cut+1..Total].

%
% print_attribute_info(Info)
%
% Prints the attribute information between the markers "attributes_start"
% and "attributes_end". Each attribute is printed as an "attribute: Name"
% line followed by either one "Key: v1 v2 ..." line per entry (when the
% value is a map) or the value itself, and is closed by "attribute_end".
%
print_attribute_info(Info) =>
    println("attributes_start"),
    foreach(Name=Desc in Info)
        printf("attribute: %w\n", Name),
        if map(Desc) then
            foreach(Category=Code in Desc)
                CodeStr = [V.to_string : V in Code].join(' '),
                printf("%w: %w\n", Category,CodeStr)
            end
        else
            printf("%w\n",Desc)
        end,
        println("attribute_end")
    end,
    println("attributes_end").