Add multiset-distance and unigram-bigram-distance, and use the latter in bmx

This commit is contained in:
Lee Spector 2023-12-10 12:37:23 -05:00
parent 39edec3865
commit 49f6f6de72
2 changed files with 22 additions and 1 deletions

View File

@ -124,3 +124,24 @@
(let [distance (levenshtein-distance seq1 seq2) (let [distance (levenshtein-distance seq1 seq2)
max-distance (max (count seq1) (count seq2))] max-distance (max (count seq1) (count seq2))]
(math/div (- max-distance distance) max-distance)))) (math/div (- max-distance distance) max-distance))))
(defn multiset-distance
  "Returns the total of the differences between the counts of all items across
  the provided multisets.

  ms1 and ms2 are collections treated as multisets: order is ignored and
  duplicates count. For every distinct item appearing in either collection,
  the absolute difference between its number of occurrences in ms1 and in ms2
  is added to the result. Two empty collections yield 0."
  [ms1 ms2]
  ;; Tally each collection once instead of re-filtering both collections for
  ;; every distinct item; this also removes the need for clojure.set.
  (let [counts1 (frequencies ms1)
        counts2 (frequencies ms2)]
    (reduce (fn [total item]
              ;; An item absent from one side counts as 0 occurrences there.
              (+ total
                 (math/abs (- (get counts1 item 0)
                              (get counts2 item 0)))))
            0
            ;; Merging the tallies gives every distinct item exactly once.
            (keys (merge counts1 counts2)))))
(defn unigram-bigram-distance
  "Returns the distance between two sequences: the multiset distance between
  their individual items (unigrams) plus half of the multiset distance between
  their adjacent pairs of items (bigrams)."
  [seq1 seq2]
  (let [bigrams       (fn [items] (partition 2 1 items)) ; overlapping adjacent pairs
        unigram-total (multiset-distance seq1 seq2)
        bigram-total  (multiset-distance (bigrams seq1) (bigrams seq2))]
    ;; Bigram differences are weighted at half the unigram differences.
    (+ unigram-total (* 0.5 bigram-total))))

View File

@ -173,7 +173,7 @@ The function `new-individual` returns a new individual produced by selection and
(flatten (interpose :gap (flatten (interpose :gap
(mapv (fn [g] (mapv (fn [g]
(if (< (rand) rate) (if (< (rand) rate)
(apply min-key #(metrics/levenshtein-distance g %) b-genes) (apply min-key #(metrics/unigram-bigram-distance g %) b-genes)
g)) g))
a-genes))))) a-genes)))))