From e3ef43e95a38d45c789dcbb0d521d6b37e54e439 Mon Sep 17 00:00:00 2001
From: Ryan Boldi <ryan.boldi123@gmail.com>
Date: Mon, 12 Dec 2022 11:53:14 -0500
Subject: [PATCH] overhaul to down-sampling. Separated selection from the
 down-sampling type.

Also, added to the docs to help startup faster
---
 doc/downsample.md                             | 63 ++++++++++++++-----
 src/propeller/downsample.cljc                 | 33 +---------
 src/propeller/gp.cljc                         | 34 +++++-----
 .../problems/simple_classification.cljc       |  2 +
 src/propeller/selection.cljc                  |  3 +-
 5 files changed, 66 insertions(+), 69 deletions(-)
diff --git a/doc/downsample.md b/doc/downsample.md
index 62c2741..7ea1b55 100644
--- a/doc/downsample.md
+++ b/doc/downsample.md
@@ -1,27 +1,56 @@
-# Downsampling the Training Data
+Downsampling the Training Data
+=
 
 Downsampling is a very simple way to improve the efficiency of your evolutionary runs. It might allow for deeper evolutionary searches and a greater success rate.
 
-Using Downsampled-Lexicase selection with propeller is easy:
+Using Downsampled selection with propeller is easy:
+
+Set the :parent-selection argument to whichever selection strategy you would like, and set the :downsample? argument to true as follows:
 
-Set the :parent-selection argument to :ds-lexicase as follows
 ```clojure
-lein run -m propeller.problems.simple-regression :parent-selection :ds-lexicase <ARGS>
+lein run -m propeller.problems.simple-regression :parent-selection :lexicase :downsample? true
 ```
 
-Arguments:
+The total number of evaluations is held constant relative to a full training set run, so set :max-generations to the number of generations you would have used with a **full** sample.
+
+## Downsample Functions
+
+In this repository, you have access to 3 different downsampling functions. These are the methods used to take a down-sample from the entire training set.
+
+To use them, add the argument ```:ds-function``` followed by the function you would like to use.
+
+The available functions are:
+- ```:case-maxmin``` - This is the method used for informed down-sampled lexicase selection
+- ```:case-maxmin-auto``` - This method automatically determines the downsample size
+- ```:case-rand``` - Random sampling
+
+### Using ```:case-maxmin```:
+
+In order to use regular informed down-sampled selection, you must specify a few things:
+- ```:downsample-rate``` - This is the $r$ parameter: what proportion of the full sample should be in the down-sample $\in [0,1]$
+- ```:ds-parent-rate``` - This is the $\rho$ parameter: what proportion of parents are used to evaluate case distances $\in [0,1]$
+- ```:ds-parent-gens``` - This is the $k$ parameter: How many generations in between parent evaluations for distances $\in \{1,2,3, \dots\}$
+
+### Using ```:case-maxmin-auto```:
+
+In order to use automatic informed down-sampled selection, you must specify a few things:
+- ```:case-delta``` - This is the $\Delta$ parameter: how close the farthest case can be to its closest case before we stop adding to the down-sample
+- ```:ids-type``` - Either ```:elite``` or ```:solved``` - Specifies whether we use elite/not-elite or solved/not-solved to binarize the case solve vectors.
+- ```:ds-parent-rate``` - This is the $\rho$ parameter: what proportion of parents are used to evaluate case distances $\in [0,1]$
+- ```:ds-parent-gens``` - This is the $k$ parameter: How many generations in between parent evaluations for distances $\in \{1,2,3, \dots\}$
+
+### Using ```:case-rand```:
+
+In order to use regular randomly down-sampled selection, you must specify a few things:
+- ```:downsample-rate``` - This is the $r$ parameter: what proportion of the full sample should be in the down-sample $\in [0,1]$
 
 
-- Case Downsampling function:
-    - Random sampling (default)
-    - Case tournament selection
-         ```clojure 
-        :ds-function :case-tournament 
-        ```
-    - Case Lexicase Selection
-        WIP
-- Downsample Rate:
-    ```clojure 
-        :downsample-rate 0.1
-    ```    
+
+
+
+Here's an example of running informed downsampled lexicase selection with $r=0.1$, $\rho=0.01$ and $k=100$ on the simple classification problem:
+
+```clojure
+lein run -m propeller.problems.simple-classification :parent-selection :lexicase :downsample? true :ds-function :case-maxmin :downsample-rate 0.1 :max-generations 300 :ds-parent-rate 0.01 :ds-parent-gens 100
+```
 
diff --git a/src/propeller/downsample.cljc b/src/propeller/downsample.cljc
index 151cec2..ca13cec 100644
--- a/src/propeller/downsample.cljc
+++ b/src/propeller/downsample.cljc
@@ -20,28 +20,6 @@
   [training-data {:keys [downsample-rate]}]
   (take (int (* downsample-rate (count training-data))) (shuffle training-data)))
 
-(defn select-downsample-avg
-  "uses case-tournament selection to select a downsample that is biased to being spread out"
-  [training-data {:keys [downsample-rate case-t-size]}]
-  (let [shuffled-cases (shuffle training-data)
-        goal-size (int (* downsample-rate (count training-data)))]
-    (loop [new-downsample (conj [] (first shuffled-cases))
-           cases-to-pick-from (rest shuffled-cases)]
-      ;(prn {:new-downsample new-downsample :cases-to-pick-from cases-to-pick-from})
-      (if (>= (count new-downsample) goal-size)
-        new-downsample
-        (let [tournament (take case-t-size cases-to-pick-from)
-              rest-of-cases (drop case-t-size cases-to-pick-from)
-              case-distances (metrics/mean-of-colls
-                              (map (fn [distance-list]
-                                     (utils/filter-by-index distance-list (map #(:index %) tournament)))
-                                   (map #(:distances %) new-downsample)))
-              selected-case-index (metrics/argmax case-distances)]
-          (prn {:avg-case-distances case-distances :selected-case-index selected-case-index})
-          (recur (conj new-downsample (nth tournament selected-case-index))
-                 (shuffle (concat (utils/drop-nth selected-case-index tournament)
-                                  rest-of-cases))))))))
-
 (defn select-downsample-maxmin
   "selects a downsample that has it's cases maximally far away by sequentially 
    adding cases to the downsample that have their closest case maximally far away"
@@ -58,10 +36,6 @@
                                      (utils/filter-by-index distance-list (map #(:index %) tournament)))
                                    (map #(:distances %) new-downsample)))
               selected-case-index (metrics/argmax min-case-distances)]
-          (if (sequential? (:input1 (first new-downsample)))
-            (prn {:cases-in-ds (map #(first (:input1 %)) new-downsample) :cases-in-tourn (map #(first (:input1 %)) tournament)})
-            (prn {:cases-in-ds (map #(:input1 %) new-downsample) :cases-in-tourn (map #(:input1 %) tournament)}))
-          (prn {:min-case-distances min-case-distances :selected-case-index selected-case-index})
           (recur (conj new-downsample (nth tournament selected-case-index))
                  (shuffle (utils/drop-nth selected-case-index tournament))))))))
 
@@ -81,13 +55,8 @@
             selected-case-index (metrics/argmax min-case-distances)]
         (if (or (= 0 (count tournament)) (<= (apply max min-case-distances) case-delta))
           new-downsample
-          (do
-            (if (sequential? (:input1 (first new-downsample)))
-              (prn {:cases-in-ds (map #(first (:input1 %)) new-downsample) :cases-in-tourn (map #(first (:input1 %)) tournament)})
-              (prn {:cases-in-ds (map #(:input1 %) new-downsample) :cases-in-tourn (map #(:input1 %) tournament)})) 
-            ;(prn {:min-case-distances min-case-distances :selected-case-index selected-case-index})
           (recur (conj new-downsample (nth tournament selected-case-index))
-                 (shuffle (utils/drop-nth selected-case-index tournament)))))))))
+                 (shuffle (utils/drop-nth selected-case-index tournament))))))))
 
 (defn get-distance-between-cases
   "returns the distance between two cases given a list of individual error vectors, and the index these
diff --git a/src/propeller/gp.cljc b/src/propeller/gp.cljc
index ff4cea1..cf16e97 100644
--- a/src/propeller/gp.cljc
+++ b/src/propeller/gp.cljc
@@ -17,13 +17,14 @@
 
 (defn report
   "Reports information each generation."
-  [evaluations pop generation argmap]
+  [evaluations pop generation argmap training-data]
   (let [best (first pop)]
     (clojure.pprint/pprint {:generation            generation
                             :best-plushy           (:plushy best)
                             :best-program          (genome/plushy->push (:plushy best) argmap)
                             :best-total-error      (:total-error best)
                             :evaluations           evaluations
+                            :ds-indices            (map #(:index %) training-data)
                             :best-errors           (:errors best)
                             :best-behaviors        (:behaviors best)
                             :genotypic-diversity   (float (/ (count (distinct (map :plushy pop))) (count pop)))
@@ -35,12 +36,13 @@
 (defn gp
   "Main GP loop."
   [{:keys [population-size max-generations error-function instructions
-           max-initial-plushy-size solution-error-threshold mapper ds-parent-rate ds-parent-gens dont-end ids-type]
+           max-initial-plushy-size solution-error-threshold mapper ds-parent-rate ds-parent-gens dont-end ids-type downsample?]
     :or   {solution-error-threshold 0.0
            dont-end false
            ds-parent-rate 0
            ds-parent-gens 1
            ids-type :solved ; :solved or :elite
+           downsample? false
            ;; The `mapper` will perform a `map`-like operation to apply a function to every individual
            ;; in the population. The default is `map` but other options include `mapv`, or `pmap`.
            mapper #?(:clj pmap :cljs map)}
@@ -55,12 +57,12 @@
                      (fn [_] {:plushy (genome/make-random-plushy instructions max-initial-plushy-size)})
                      (range population-size))
          indexed-training-data (downsample/assign-indices-to-data (downsample/initialize-case-distances argmap))]
-    (let [training-data (if (= (:parent-selection argmap) :ds-lexicase)
+    (let [training-data (if downsample?
                           (case (:ds-function argmap)
-                            :case-avg (downsample/select-downsample-avg indexed-training-data argmap)
                             :case-maxmin (downsample/select-downsample-maxmin indexed-training-data argmap)
                             :case-maxmin-auto (downsample/select-downsample-maxmin-adaptive indexed-training-data argmap)
-                            (downsample/select-downsample-random indexed-training-data argmap))
+                            :case-rand (downsample/select-downsample-random indexed-training-data argmap)
+                            (do (prn {:error "Invalid Downsample Function"}) (downsample/select-downsample-random indexed-training-data argmap)))
                           indexed-training-data) ;defaults to random
           parent-reps (if (zero? (mod generation ds-parent-gens)) ;every ds-parent-gens generations
                         (take (* ds-parent-rate (count population)) (shuffle population))
@@ -74,21 +76,17 @@
                                      (partial error-function argmap training-data)
                                      population))
           best-individual (first ds-evaluated-pop)
-          best-individual-passes-ds (and (= (:parent-selection argmap) :ds-lexicase) (<= (:total-error best-individual) solution-error-threshold))]
-      (prn {:ds-indices-list (map #(:index %) training-data)})
-      ;(if (sequential? (:input1 (first training-data)))
-        ;(prn {:ds-inputs (map #(first (:input1 %)) training-data)})
-        ;(prn {:ds-inputs (map #(:input1 %) training-data)}))
+          best-individual-passes-ds (and downsample? (<= (:total-error best-individual) solution-error-threshold))]
       (if (:custom-report argmap)
         ((:custom-report argmap) evaluations ds-evaluated-pop generation argmap)
-        (report evaluations ds-evaluated-pop generation argmap))
+        (report evaluations ds-evaluated-pop generation argmap training-data))
       ;;did the indvidual pass all cases in ds?
       (when best-individual-passes-ds
         (prn {:semi-success-generation generation}))
       (cond
         ;; Success on training cases is verified on testing cases
         (if (or (and best-individual-passes-ds (<= (:total-error (error-function argmap indexed-training-data best-individual)) solution-error-threshold))
-                     (and (not= (:parent-selection argmap) :ds-lexicase)
+                     (and (not downsample?)
                           (<= (:total-error best-individual) solution-error-threshold)))
                (do (prn {:success-generation generation})
                    (prn {:total-test-error
@@ -100,10 +98,10 @@
                false)
         nil
         ;;
-        (and (not= (:ds-function argmap) :case-maxmin-auto) (>= generation max-generations))
+        (and (not downsample?) (>= generation max-generations))
         nil
         ;;
-        (and (= (:ds-function argmap) :case-maxmin-auto) (>= evaluations (* max-generations population-size (count indexed-training-data))))
+        (and downsample? (>= evaluations (* max-generations population-size (count indexed-training-data))))
         nil
         ;;
         :else (recur (inc generation)
@@ -117,8 +115,8 @@
                                                                           (first reindexed-pop)))
                          (hyperselection/log-hyperselection-and-ret (repeatedly population-size ;need to count occurance of each parent, and reset IDs
                                                                                 #(variation/new-individual reindexed-pop argmap)))))
-                     (if (= (:parent-selection argmap) :ds-lexicase)
-                       (if (zero? (mod generation ds-parent-gens))
-                         (downsample/update-case-distances rep-evaluated-pop indexed-training-data indexed-training-data ids-type) ; update distances every ds-parent-gens generations
-                         indexed-training-data)
+                     (if downsample?
+                      (if (zero? (mod generation ds-parent-gens))
+                        (downsample/update-case-distances rep-evaluated-pop indexed-training-data indexed-training-data ids-type) ; update distances every ds-parent-gens generations
+                        indexed-training-data)
                        indexed-training-data))))))
\ No newline at end of file
diff --git a/src/propeller/problems/simple_classification.cljc b/src/propeller/problems/simple_classification.cljc
index 44e02ab..8e87ce7 100644
--- a/src/propeller/problems/simple_classification.cljc
+++ b/src/propeller/problems/simple_classification.cljc
@@ -88,11 +88,13 @@
        :case-t-size             (count (:train train-and-test-data))
        :ds-parent-rate          0
        :ds-parent-gens          1
+       :ds-function             :case-rand
        :max-generations         500
        :population-size         500
        :max-initial-plushy-size 100
        :step-limit              200
        :parent-selection        :lexicase
+       :downsample?             false
        :tournament-size         5
        :umad-rate               0.1
        :variation               {:umad 1.0 :crossover 0.0}
diff --git a/src/propeller/selection.cljc b/src/propeller/selection.cljc
index 8fbf09f..487ecf2 100755
--- a/src/propeller/selection.cljc
+++ b/src/propeller/selection.cljc
@@ -26,5 +26,4 @@
   [pop argmap]
   (case (:parent-selection argmap)
     :tournament (tournament-selection pop argmap)
-    :lexicase (lexicase-selection pop argmap)
-    :ds-lexicase (lexicase-selection pop argmap)))
+    :lexicase (lexicase-selection pop argmap)))