src/utils/FrequencyMap.scala


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92

package ixee.cryptopals.utils

import FunctionUtils.tup

/** Table of relative character frequencies, used to score how closely one frequency
  * distribution resembles another.
  *
  * Keys are folded to lower case on construction. When the input contains several keys that
  * fold to the same character (e.g. both `'A'` and `'a'`), their frequencies are added together
  * rather than one of them being silently discarded.
  *
  * @param mapargs frequency of each character; keys may be in either case
  */
class FrequencyMap(mapargs: Map[Char, Double]) {
  sealed trait DiffResult
  object Inconclusive extends DiffResult

  /** [[likelyFitting]] accepts a candidate when `1 - diff` exceeds this value. */
  val FittingThreshold: Double = 0.95

  /** Factor applied by [[at]] to the stored frequency when the lookup character is upper case.
    * Biases hard against upper-case spam.
    */
  // TODO: remove this bias, or make it a per-map setting.
  val UppercasePenalty: Double = 0.0002

  /** Case-folded frequencies: every key is lower-cased, colliding keys are summed. */
  val mappings: Map[Char, Double] =
    mapargs
      .groupBy { case (c, _) => c.toLower }
      .map { case (folded, sameChar) => folded -> sameChar.values.sum }

  /** Looks up the stored frequency for `c`.
    *
    * @param c character to look up, in either case
    * @return the frequency stored under the lower-cased character, multiplied by
    *         [[UppercasePenalty]] only when `c` is upper case; `None` when the character
    *         is not in the map
    */
  def at(c: Char): Option[Double] = {
    val stored = mappings.get(c.toLower)
    // Only genuine upper-case characters are penalised; digits, punctuation and whitespace
    // are neither upper nor lower case and keep their stored frequency.
    if (c.isUpper) stored.map(_ * UppercasePenalty) else stored
  }

  /** Sums [[diffAt]] over every entry of `other`.
    *
    * Pseudoscience: characters that appear only in this map contribute nothing, and because
    * no sample counts are available the result is not normalised by sample size.
    *
    * @param other the distribution being compared against this one
    * @return the accumulated per-character difference
    */
  def diff(other: FrequencyMap): Double =
    other.mappings.foldLeft(0.0) { case (total, (c, charFreq)) =>
      total + diffAt(c, charFreq)
    }

  /** Difference between `charFreq` and the frequency expected for `c`.
    *
    * Characters listed in the hard-coded table are compared against that table; all others
    * are compared against [[at]]. The result is the square root of the squared error. A
    * character with no expectation at all contributes the square root of a fixed 0.4.
    *
    * @param c        the character being scored
    * @param charFreq the frequency to compare against the expectation for `c`
    */
  def diffAt(c: Char, charFreq: Double): Double = {
    val expectation = hardcodedExpectation(c).orElse(at(c))
    val squaredError = expectation.map(e => squared(e - charFreq)).getOrElse(0.4)
    Math.pow(squaredError, 0.5)
  }

  // TODO: don't hardcode these..
  /** Fixed expected frequency for spaces, digits and common punctuation, if `c` is one. */
  private def hardcodedExpectation(c: Char): Option[Double] =
    c match {
      case ' ' =>
        Some(0.10)
      case '{' | '}' | '`' | '|' | '^' =>
        Some(0.000001) // { } | and ` are very unlikely irl
      case '[' | ']' =>
        Some(0.0000015) // [ ] are more likely
      case '"' | '\'' =>
        Some(0.00001) // quotes are rare most of the time
      case '~' | '+' | '=' | '<' | '>' | '/' | '\\' =>
        Some(0.00000175) // math-ish symbols
      case ';' | ':' | '-' | '*' | '(' | '&' | ')' | '_' =>
        Some(0.000003) // getting into more common punctuation
      case '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' =>
        Some(0.000002) // numbers are kind of uncommon
      case '$' | '%' | '#' | '@' | '!' =>
        Some(0.00002) // more punctuation
      case '.' | ',' =>
        Some(0.00007) // and the last of the punctuation
      case '\n' | '\r' =>
        Some(0.000001) // explicit \r \n is rare in freeform text
      case _ =>
        None
    }

  /** Returns `x * x`. */
  def squared(x: Double): Double = x * x

  /** Whether this map is close enough to `other`.
    *
    * Computes `other.diff(this)`, i.e. this map's entries are scored against `other`'s
    * expectations, and accepts when `1 - diff` is greater than [[FittingThreshold]].
    */
  def likelyFitting(other: FrequencyMap): Boolean =
    1 - other.diff(this) > FittingThreshold

  /** String form of the constructor argument as given (before case folding). */
  override def toString: String = mapargs.toString
}

/** A [[FrequencyMap]] built from another map's `mappings`, together with a sample count.
  *
  * Unfinished: `samples` is accepted but not used anywhere yet.
  *
  * @param fm      the frequency map whose mappings are reused
  * @param samples sample count associated with `fm` (currently unused)
  */
class SampledFrequencyMap(
    fm: FrequencyMap,
    samples: Int
) extends FrequencyMap(fm.mappings) {
  // TODO: come back to this
}

/** Factory methods for [[FrequencyMap]]. */
object FrequencyMap {

  /** Builds a map from individual character/frequency pairs. */
  def apply(mapargs: (Char, Double)*): FrequencyMap = apply(mapargs.toMap)

  /** Builds a map from an existing character-to-frequency table. */
  def apply(mapargs: Map[Char, Double]): FrequencyMap = new FrequencyMap(mapargs)

  /** Measures the character frequencies of `s`.
    *
    * Each distinct character is mapped to its number of occurrences divided by `s.length`.
    * An empty string produces an empty map.
    *
    * @param s the text to measure
    */
  def of(s: String): FrequencyMap = {
    val total = s.length.toDouble

    // Tally occurrences one character at a time, left to right.
    val occurrences = s.foldLeft(Map.empty[Char, Int]) { (seen, c) =>
      seen.updated(c, seen.getOrElse(c, 0) + 1)
    }

    apply(occurrences.map { case (c, n) => c -> n / total })
  }
}