summary | refs | log | tree | commit | diff
path: root/src/utils/FrequencyMap.scala
diff options
context:
space:
mode:
Diffstat (limited to 'src/utils/FrequencyMap.scala')
-rw-r--r-- src/utils/FrequencyMap.scala 91
1 files changed, 91 insertions, 0 deletions
diff --git a/src/utils/FrequencyMap.scala b/src/utils/FrequencyMap.scala
new file mode 100644
index 0000000..0bea25c
--- /dev/null
+++ b/src/utils/FrequencyMap.scala
@@ -0,0 +1,91 @@
+package ixee.cryptopals.utils
+
+import ConversionUtils.tup
+
+/**
+ * A case-insensitive table of character frequencies that can be scored against
+ * another table with a sum of squared differences.
+ *
+ * Keys of `mapargs` are lower-cased to build `mappings`. Entries whose keys
+ * differ only by case (for example 'A' and 'a') are summed, so no frequency
+ * mass is dropped when the keys collapse onto each other.
+ *
+ * @param mapargs character -> frequency pairs exactly as supplied by the caller;
+ *                this original map is what `toString` prints
+ */
+class FrequencyMap(mapargs: Map[Char, Double]) {
+  sealed trait DiffResult
+  object Inconclusive extends DiffResult
+
+  /** Lower bound that `1 - diff` must exceed for `likelyFitting` to report a fit. */
+  val FittingThreshold: Double = 0.95
+
+  /** Factor `at` applies to the stored frequency when queried with a non-lower-case letter. */
+  val UppercaseBias: Double = 0.002
+
+  /** Frequencies keyed by lower-cased character; colliding keys are added together. */
+  val mappings: Map[Char, Double] =
+    mapargs.foldLeft(Map.empty[Char, Double]) { case (merged, (c, freq)) =>
+      val key = c.toLower
+      merged.updated(key, merged.getOrElse(key, 0.0) + freq)
+    }
+
+  /**
+   * Looks up the frequency recorded for `c`.
+   *
+   * A character that has a distinct lower-case form (i.e. an upper-case letter)
+   * is looked up under that lower-case form and scaled by `UppercaseBias`.
+   * Every other character (lower-case letters, digits, punctuation, ...) is
+   * returned unscaled.
+   *
+   * @param c the character to look up
+   * @return the (possibly scaled) frequency, or `None` when `c` is not in the map
+   */
+  def at(c: Char): Option[Double] = {
+    val lowered = c.toLower
+    if (lowered != c)
+      // TODO remove the bias here; it should become a per-map setting.
+      mappings.get(lowered).map(_ * UppercaseBias) // bias HARD against uppercase spam
+    else
+      mappings.get(c)
+  }
+
+  /**
+   * Scores how far `other` is from this map: the sum of `diffAt` over every
+   * entry of `other.mappings`. Lower means closer.
+   *
+   * This is a heuristic. The original idea was to sum the squared differences
+   * and to discard characters missing from `other` when `other` is a short
+   * sample (possibly dividing by its total count), but a plain FrequencyMap
+   * carries no sample count, so every entry of `other` is always scored.
+   *
+   * @param other the map whose entries are scored against this one
+   * @return the accumulated squared difference
+   */
+  def diff(other: FrequencyMap): Double =
+    other.mappings.foldLeft(0.0) { case (total, (c, freq)) =>
+      total + diffAt(c, freq)
+    }
+
+  /**
+   * Squared difference between `charFreq` and the frequency expected for `c`.
+   *
+   * Spaces always score 0. A fixed set of punctuation, digit and line-break
+   * characters is compared against hard-coded expected frequencies. Anything
+   * else is compared against this map via `at`, scoring 0 when this map has no
+   * entry for `c`.
+   *
+   * @param c        the character being scored
+   * @param charFreq the frequency observed for `c` in the map under test
+   * @return the squared difference
+   */
+  // TODO: don't hardcode these..
+  def diffAt(c: Char, charFreq: Double): Double =
+    c match {
+      case ' ' => 0.0
+      case '{' | '}' | '`' | '|' =>
+        squared(0.000001 - charFreq) // { } | and ` are very unlikely in real text
+      case '[' | ']' =>
+        squared(0.000002 - charFreq) // [ ] are slightly more likely
+      case '"' | '\'' =>
+        squared(0.00001 - charFreq) // quotes are rare in most text
+      case '~' | '+' | '=' | '<' | '>' | '/' | '\\' =>
+        squared(0.000004 - charFreq) // math-like symbols
+      case ';' | ':' | '-' | '*' | '(' | '&' | ')' | '_' =>
+        squared(0.000009 - charFreq) // more common punctuation
+      case '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' =>
+        squared(0.00001 - charFreq) // digits are fairly uncommon
+      case '$' | '%' | '#' | '@' | '!' =>
+        squared(0.00003 - charFreq) // more punctuation
+      case '.' | ',' =>
+        squared(0.00007 - charFreq) // the most common punctuation
+      case '\n' | '\r' =>
+        squared(0.0000002 - charFreq) // explicit \r \n is rare in freeform text
+      case _ =>
+        at(c).fold(0.0)(expected => squared(expected - charFreq))
+    }
+
+  /** Returns `x * x`. */
+  def squared(x: Double): Double = x * x
+
+  /**
+   * Reports whether this map looks like a fit for `other`.
+   *
+   * Note the direction: the score is `other.diff(this)`, so this map's entries
+   * are the ones scored against `other`.
+   *
+   * @return true when `1 - other.diff(this)` exceeds `FittingThreshold`
+   */
+  def likelyFitting(other: FrequencyMap): Boolean =
+    1 - other.diff(this) > FittingThreshold
+
+  /** Prints the map originally passed to the constructor (before key lower-casing). */
+  override def toString: String = mapargs.toString
+}
+
+/**
+ * A FrequencyMap built from the `mappings` of an existing map, together with a
+ * `samples` count.
+ *
+ * Nothing in this class reads `samples` yet. It is presumably the number of
+ * observations behind the frequencies -- confirm when this class is filled in.
+ *
+ * @param fm      the map whose (already lower-cased) `mappings` are reused
+ * @param samples currently unused
+ */
+class SampledFrequencyMap(fm: FrequencyMap, samples: Int)
+  extends FrequencyMap(fm.mappings) {
+  // Placeholder: sample-aware behaviour is still to be written.
+}
+
+/** Factory methods for building a FrequencyMap. */
+object FrequencyMap {
+
+  /** Builds a map from explicit (character, frequency) pairs. */
+  def apply(mapargs: (Char, Double)*): FrequencyMap = FrequencyMap(mapargs.toMap)
+
+  /** Builds a map from an existing character -> frequency map. */
+  def apply(mapargs: Map[Char, Double]): FrequencyMap = new FrequencyMap(mapargs)
+
+  /**
+   * Measures the relative frequency of every character in `s`: each distinct
+   * character maps to its occurrence count divided by the length of `s`.
+   * An empty string yields an empty map.
+   */
+  def of(s: String) = {
+    val length = s.length.toDouble
+
+    // Tally occurrences one character at a time, in the order they appear.
+    val occurrences = s.foldLeft(Map.empty[Char, Int]) { (tally, ch) =>
+      tally.updated(ch, tally.getOrElse(ch, 0) + 1)
+    }
+
+    FrequencyMap(occurrences.map { case (ch, n) => ch -> (n / length) })
+  }
+}