1 files changed, 91 insertions, 0 deletions
diff --git a/src/utils/FrequencyMap.scala b/src/utils/FrequencyMap.scala
new file mode 100644
index 0000000..0bea25c
--- /dev/null
+++ b/src/utils/FrequencyMap.scala
@@ -0,0 +1,91 @@
+package ixee.cryptopals.utils
+
+import ConversionUtils.tup
+
+class FrequencyMap(mapargs: Map[Char, Double]) {
+  sealed trait DiffResult
+  object Inconclusive extends DiffResult
+
+  val FittingThreshold: Double = 0.95
+
+  val mappings = mapargs map tup(_.toLower -> _)
+
+  def at(c: Char): Option[Double] =
+    if (c.isLower)
+      mappings.get(c)
+    else                                    // TODO remove the bias here
+      mappings.get(c.toLower).map(_ * 0.002) // bias HARD against uppercase spam. should make this a per-map setting.
+
+  def diff(other: FrequencyMap): Double = {
+    // pseudoscience here
+    /*
+     * Idea...
+     *   sum difference squares between other and this
+     *   discarding characters that aren't present in other if other's length is < X
+     *
+     * .. divide by other.totalCount?
+    */
+
+    // since this doesn't have sample counts, can't do switching based on other.sampleCount
+    // so just do it.
+
+    other.mappings.foldLeft(0.0) { (confidence: Double, next: (Char, Double)) =>
+      confidence + tup(diffAt _)(next)
+    }
+  }
+
+  // TODO: don't hardcode these..
+  def diffAt(c: Char, charFreq: Double) =
+    c match {
+      case ' ' => 0
+      case '{' | '}' | '`' | '|' =>
+        squared(0.000001 - charFreq) // { } | and ` are very unlikely irl
+      case '[' | ']' =>
+        squared(0.000002 - charFreq) // [ ] are more likely
+      case '"' | '\''=>
+        squared(0.00001 - charFreq) // " or ' are 100% unlikely 90% of the time
+      case '~' | '+' | '=' | '<' | '>' | '/' | '\\' =>
+        squared(0.000004 - charFreq) // math is weird kids
+      case ';' | ':' | '-' | '*' | '(' | '&' | ')' | '_' =>
+        squared(0.000009 - charFreq) // getting into more common punctuation
+      case '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' =>
+        squared(0.00001 - charFreq) // numbers are KINDA uncommon
+      case '$' | '%' | '#' | '@' | '!' =>
+        squared(0.00003 - charFreq) // more punctuation...
+      case '.' | ',' =>
+        squared(0.00007 - charFreq) // and the last of the punctuations
+      case '\n' | '\r' =>
+        squared(0.0000002 - charFreq) // explicit \r \n is rare in freeform text.
+      case _ =>
+        this.at(c).map(_ - charFreq).map(squared).getOrElse(0.0)
+    }
+
+  def squared(x: Double) = x * x
+
+  def likelyFitting(other: FrequencyMap) = 1 - (other diff this) > FittingThreshold
+
+  override def toString = mapargs.toString
+}
+
+class SampledFrequencyMap(fm: FrequencyMap, samples: Int) extends FrequencyMap(fm.mappings) {
+  // come back to this
+}
+
+object FrequencyMap {
+  def apply(mapargs: (Char, Double)*): FrequencyMap = FrequencyMap(mapargs.toMap)
+  def apply(mapargs: Map[Char, Double]): FrequencyMap = new FrequencyMap(mapargs)
+
+  def of(s: String) = {
+    def count[A](m: Map[A, Int], c: A): Map[A, Int]  =
+      m.get(c) match {
+        case Some(count) => m + (c -> (count + 1))
+        case None => m + (c -> 1)
+      }
+
+    FrequencyMap(
+      s
+        .foldLeft(Map[Char, Int]())(count _)
+        .map(tup(_ -> _ / s.length.toDouble))
+    )
+  }
+}