From aa390776b26d794ab37aa13833fa7043aad16504 Mon Sep 17 00:00:00 2001 From: iximeow Date: Sun, 23 Nov 2014 03:09:27 -0800 Subject: Add in code for challenge 3 --- src/utils/ByteUtils.scala | 56 +++++----------------- src/utils/ConversionUtils.scala | 54 +++++++++++++++++++-- src/utils/Frequencies.scala | 101 ++++++++++++++++++++++++++++++++++++++++ src/utils/FrequencyMap.scala | 91 ++++++++++++++++++++++++++++++++++++ src/utils/TextScorer.scala | 32 +++++++++++++ 5 files changed, 286 insertions(+), 48 deletions(-) create mode 100644 src/utils/Frequencies.scala create mode 100644 src/utils/FrequencyMap.scala create mode 100644 src/utils/TextScorer.scala (limited to 'src/utils') diff --git a/src/utils/ByteUtils.scala b/src/utils/ByteUtils.scala index 85c6058..ffc2e20 100644 --- a/src/utils/ByteUtils.scala +++ b/src/utils/ByteUtils.scala @@ -1,5 +1,7 @@ package ixee.cryptopals.utils +import ConversionUtils.tup + object ByteUtils { trait SizedNumeric[T] { def manifest: Manifest[T] @@ -17,6 +19,7 @@ object ByteUtils { def @<<(t: T, x: Int): T def @|(a: T, b: T): T def @&(a: T, b: T): T + def @^(a: T, b: T): T def @+(a: T, b: T): T } @@ -25,6 +28,7 @@ object ByteUtils { def @<<(t: Byte, x: Int): Byte = (t << x).toByte def @|(a: Byte, b: Byte): Byte = (a | b).toByte def @&(a: Byte, b: Byte): Byte = (a & b).toByte + def @^(a: Byte, b: Byte): Byte = (a ^ b).toByte def @+(a: Byte, b: Byte): Byte = (a + b).toByte } @@ -33,6 +37,7 @@ object ByteUtils { def @<<(t: Int, x: Int): Int = (t << x) def @|(a: Int, b: Int): Int = a | b def @&(a: Int, b: Int): Int = a & b + def @^(a: Int, b: Int): Int = a ^ b def @+(a: Int, b: Int): Int = a + b } @@ -41,11 +46,13 @@ object ByteUtils { def @<<(b: Int): T = implicitly[BitOps[T]].@<<(x, b) def @|(b: T): T = implicitly[BitOps[T]].@|(x, b) def @&(b: T): T = implicitly[BitOps[T]].@&(x, b) + def @^(b: T): T = implicitly[BitOps[T]].@^(x, b) def @+(b: T): T = implicitly[BitOps[T]].@+(x, b) } implicit class SizedWithByteInfo[T : SizedNumeric](x: T) { def byteSize = implicitly[SizedNumeric[T]].byteSize + def octetSize = byteSize * 2 def bitSize = implicitly[SizedNumeric[T]].bitSize def toLong = implicitly[SizedNumeric[T]].toLong(x) def liftedTo[U : SizedNumeric]: U = { @@ -57,36 +64,12 @@ object ByteUtils { uSize.buildFrom(x) } + def hex: String = s"%0${octetSize}x" format x def truncatedTo[U : SizedNumeric]: U = { implicitly[SizedNumeric[U]].truncate(x) } } - implicit class RichIterableByte(iter: Iterable[Byte]) { - def to[T : SizedNumeric : BitOps]: T = { - // Need to know length, so we must force the iter - iter.hasDefiniteSize match { - case true => throw new IllegalArgumentException("Argument is not finite!") - case false => { - val numeric = implicitly[SizedNumeric[T]] - if(b.length < numeric.byteSize) { - throw new RuntimeException("Byte input is not long enough") - } - - var out: Long = b(0) - for(i <- 1 until numeric.byteSize) { - out << 8 - out = b(i) | out - } - numeric.fromLong(out) - } - } - } - - def xor(other: Iterable[Byte]): Iterable[Byte] = - iter.zip(other).map(_ ^@ _) - - } def sizedNumeric[T : Manifest](bytes: Int)(from: Long => T)(to: T => Long) = new SizedNumeric[T] { def manifest = implicitly[Manifest[T]] def byteSize = bytes @@ -99,25 +82,8 @@ object ByteUtils { implicit val shortized = sizedNumeric(2) { x: Long => x.toShort } { _.toLong } implicit val byteSized = sizedNumeric(1) { x: Long => x.toByte } { _.toLong } - def toArrayBuf[T : SizedNumeric : BitOps](x: T): Array[Byte] = { - val buf = new Array[Byte](x.byteSize) - for(i <- 0 until x.byteSize) { - val shiftAmount: Int = (x.byteSize - i - 1) << 3 - buf(i) = (x @>>> shiftAmount).truncatedTo[Byte] - } - buf - } - - def toBinaryString[T : SizedNumeric](x: T)(implicit a: BitOps[T]): String = { - val buf = new StringBuilder(x.bitSize) - for(i <- 0 until x.bitSize) { - val shiftAmount: Int = x.bitSize - i - 1 - buf.append((x @>>> shiftAmount) @& 0x01.liftedTo[T]) - } - buf.toString() - } - - def toHexString[T : SizedNumeric : BitOps](x: Iterable[T]): String = { - + implicit class SeqByteOps(seq: Seq[Byte]) { + def xor(other: Seq[Byte]): Seq[Byte] = + seq.zip(other).map(tup(_ @^ _)) } } diff --git a/src/utils/ConversionUtils.scala b/src/utils/ConversionUtils.scala index 26f2d24..09f3aab 100644 --- a/src/utils/ConversionUtils.scala +++ b/src/utils/ConversionUtils.scala @@ -1,10 +1,10 @@ package ixee.cryptopals.utils -import ByteUtils.WithBitOpts +import ByteUtils._ object ConversionUtils { - def hexStr2Bytes(s: String): Iterator[Byte] = - padToByte(s).grouped(2).map(byteStr2Byte) + def hexStr2Bytes(s: String): Seq[Byte] = + padToByte(s).grouped(2).map(byteStr2Byte).toSeq def byteStr2Byte(str: String): Byte = charSeq2Byte(str.toSeq) @@ -34,5 +34,53 @@ object ConversionUtils { hexStr2Bytes(s).toArray.toBase64 } + + implicit class RichSeqByte(seq: Seq[Byte]) { + def to[T : SizedNumeric : BitOps]: T = { + val numeric = implicitly[SizedNumeric[T]] + if(seq.length < numeric.byteSize) { + throw new RuntimeException("Byte input is not long enough") + } + + var out: Long = seq(0) + for(i <- 1 until numeric.byteSize) { + out << 8 + out = seq(i) | out + } + numeric.fromLong(out) + } + + def hex: String = + seq.map(_.hex).reduceLeft(_ + _) + + def asAscii: String = + new String(seq.toArray) + } + + implicit class RichStringBytes(s: String) { + def asBytes = s.toSeq.map(_.toByte) + } + + // Because doing (_ f _).tupled confuses the inferencer... + def tup[A, B, C](f: (A, B) => C): ((A, B)) => C = f.tupled + + // TODO: tailrec this, get rid of for + def toByteSeq[T : SizedNumeric : BitOps](x: T): Seq[Byte] = { + val buf = new Array[Byte](x.byteSize) + for(i <- 0 until x.byteSize) { + val shiftAmount = (x.byteSize - i - 1) << 3 + buf(i) = (x @>>> shiftAmount).truncatedTo[Byte] + } + buf.toSeq + } + + def toBinaryString[T : SizedNumeric](x: T)(implicit a: BitOps[T]): String = { + val buf = new StringBuilder(x.bitSize) + for(i <- 0 until x.bitSize) { + val shiftAmount: Int = x.bitSize - i - 1 + buf.append((x @>>> shiftAmount) @& 0x01.liftedTo[T]) + } + buf.toString() + } } diff --git a/src/utils/Frequencies.scala b/src/utils/Frequencies.scala new file mode 100644 index 0000000..63d74ac --- /dev/null +++ b/src/utils/Frequencies.scala @@ -0,0 +1,101 @@ +package ixee.cryptopals.utils + +object Frequencies { + lazy val frequencies = Seq( + cryptologicalMathematics, + cornell40kSample + ) + + // http://www.math.cornell.edu/~mec/2003-2004/cryptography/subs/frequencies.html + val cornell40kSample = FrequencyMap( + 'e' -> 0.1202, + 't' -> 0.0910, + 'a' -> 0.0812, + 'o' -> 0.0768, + 'i' -> 0.0731, + 'n' -> 0.0695, + 's' -> 0.0628, + 'r' -> 0.0602, + 'h' -> 0.0592, + 'd' -> 0.0432, + 'l' -> 0.0398, + 'u' -> 0.0288, + 'c' -> 0.0271, + 'm' -> 0.0261, + 'f' -> 0.0230, + 'y' -> 0.0211, + 'w' -> 0.0209, + 'g' -> 0.0203, + 'p' -> 0.0182, + 'b' -> 0.0149, + 'v' -> 0.0111, + 'k' -> 0.0069, + 'x' -> 0.0017, + 'q' -> 0.0011, + 'j' -> 0.0010, + 'z' -> 0.0007 + ) + + // from http://en.algoritmy.net/article/40379/Letter-frequency-English + val cryptologicalMathematics = FrequencyMap( + 'a' -> 0.08167, + 'b' -> 0.01492, + 'c' -> 0.02782, + 'd' -> 0.04253, + 'e' -> 0.12702, + 'f' -> 0.02228, + 'g' -> 0.02015, + 'h' -> 0.06094, + 'i' -> 0.06966, + 'j' -> 0.00153, + 'k' -> 0.00772, + 'l' -> 0.04025, + 'm' -> 0.02406, + 'n' -> 0.06749, + 'o' -> 0.07507, + 'p' -> 0.01929, + 'q' -> 0.00095, + 'r' -> 0.05987, + 's' -> 0.06327, + 't' -> 0.09056, + 'u' -> 0.02758, + 'v' -> 0.00978, + 'w' -> 0.02360, + 'x' -> 0.00150, + 'y' -> 0.01974, + 'z' -> 0.00074 + ) + + // http://en.wikipedia.org/wiki/Letter_frequency#cite_note-13 + // Calculated from "Project Gutenberg Selections" available from the NLTK Corpora + // http://nltk.googlecode.com/svn/trunk/nltk_data/index.xml + // has since gone dead. + val firstLetterOfWord = FrequencyMap( + 'a' -> 0.11602, + 'b' -> 0.04702, + 'c' -> 0.03511, + 'd' -> 0.02670, + 'e' -> 0.02007, + 'f' -> 0.03779, + 'g' -> 0.01950, + 'h' -> 0.07232, + 'i' -> 0.06286, + 'j' -> 0.00597, + 'k' -> 0.00590, + 'l' -> 0.02705, + 'm' -> 0.04374, + 'n' -> 0.02365, + 'o' -> 0.06264, + 'p' -> 0.02545, + 'q' -> 0.00173, + 'r' -> 0.01653, + 's' -> 0.07755, + 't' -> 0.16671, + 'u' -> 0.01487, + 'v' -> 0.00649, + 'w' -> 0.06753, + 'x' -> 0.00017, + 'y' -> 0.01620, + 'z' -> 0.00034 + ) +} diff --git a/src/utils/FrequencyMap.scala b/src/utils/FrequencyMap.scala new file mode 100644 index 0000000..0bea25c --- /dev/null +++ b/src/utils/FrequencyMap.scala @@ -0,0 +1,91 @@ +package ixee.cryptopals.utils + +import ConversionUtils.tup + +class FrequencyMap(mapargs: Map[Char, Double]) { + sealed trait DiffResult + object Inconclusive extends DiffResult + + val FittingThreshold: Double = 0.95 + + val mappings = mapargs map tup(_.toLower -> _) + + def at(c: Char): Option[Double] = + if (c.isLower) + mappings.get(c) + else // TODO remove the bias here + mappings.get(c.toLower).map(_ * 0.002) // bias HARD against uppercase spam. should make this a per-map setting. + + def diff(other: FrequencyMap): Double = { + // pseudoscience here + /* + * Idea... + * sum difference squares between other and this + * discarding characters that aren't present in other if other's length is < X + * + * .. divide by other.totalCount? + */ + + // since this doesn't have sample counts, can't do switching based on other.sampleCount + // so just do it. + + other.mappings.foldLeft(0.0) { (confidence: Double, next: (Char, Double)) => + confidence + tup(diffAt _)(next) + } + } + + // TODO: don't hardcode these.. + def diffAt(c: Char, charFreq: Double) = + c match { + case ' ' => 0 + case '{' | '}' | '`' | '|' => + squared(0.000001 - charFreq) // { } | and ` are very unlikely irl + case '[' | ']' => + squared(0.000002 - charFreq) // [ ] are more likely + case '"' | '\''=> + squared(0.00001 - charFreq) // " or ' are 100% unlikely 90% of the time + case '~' | '+' | '=' | '<' | '>' | '/' | '\\' => + squared(0.000004 - charFreq) // math is weird kids + case ';' | ':' | '-' | '*' | '(' | '&' | ')' | '_' => + squared(0.000009 - charFreq) // getting into more common punctuation + case '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' => + squared(0.00001 - charFreq) // numbers are KINDA uncommon + case '$' | '%' | '#' | '@' | '!' => + squared(0.00003 - charFreq) // more punctuation... + case '.' | ',' => + squared(0.00007 - charFreq) // and the last of the punctuations + case '\n' | '\r' => + squared(0.0000002 - charFreq) // explicit \r \n is rare in freeform text. + case _ => + this.at(c).map(_ - charFreq).map(squared).getOrElse(0.0) + } + + def squared(x: Double) = x * x + + def likelyFitting(other: FrequencyMap) = 1 - (other diff this) > FittingThreshold + + override def toString = mapargs.toString +} + +class SampledFrequencyMap(fm: FrequencyMap, samples: Int) extends FrequencyMap(fm.mappings) { + // come back to this +} + +object FrequencyMap { + def apply(mapargs: (Char, Double)*): FrequencyMap = FrequencyMap(mapargs.toMap) + def apply(mapargs: Map[Char, Double]): FrequencyMap = new FrequencyMap(mapargs) + + def of(s: String) = { + def count[A](m: Map[A, Int], c: A): Map[A, Int] = + m.get(c) match { + case Some(count) => m + (c -> (count + 1)) + case None => m + (c -> 1) + } + + FrequencyMap( + s + .foldLeft(Map[Char, Int]())(count _) + .map(tup(_ -> _ / s.length.toDouble)) + ) + } +} diff --git a/src/utils/TextScorer.scala b/src/utils/TextScorer.scala new file mode 100644 index 0000000..b02c250 --- /dev/null +++ b/src/utils/TextScorer.scala @@ -0,0 +1,32 @@ +package ixee.cryptopals.utils + +import ByteUtils._ + +object TextScorer { + val controlChars: Seq[Byte] = Seq( + 0x08, 0x09, 0x0a, 0x0b, 0x0d + ).map(_.toByte) + + def isText(s: Iterable[Byte]): Boolean = + s.forall(_ @& 0x80.toByte == 0) && + s.filter(_ < 0x20).forall(controlChars.contains _) + + def score(s: Seq[Byte]): Double = + if (!isText(s)) + -1 // not English text! + else + scoreBy(new String(s.toArray), Frequencies.cornell40kSample) + //score s + + def scoreBy(s: String, fm: FrequencyMap): Double = { + val sfm = FrequencyMap.of(s) + fm diff sfm + } + + def looksEnglish(s: String)(implicit fm: FrequencyMap): Boolean = + if (!isText(s.toCharArray.map(_.toByte))) + false + else + FrequencyMap.of(s).likelyFitting(fm) +} + -- cgit v1.1