{ A Plan for Spam

A Plan for Spam

This is a Jscheme implementation of Paul Graham's "A Plan for Spam" by kanderson@bbn.com. see http://openmap.bbn.com/~kanderso/plan

Requirements

Java hint: This version reads in all the email into memory, you can run the example code with:

  java  -Xms50M -Xmx300M -jar lib/jscheme.jar plan.scm example.scm
EMACS hint: M-X toggle-truncate-lines is worth turning on because it will make printing long lists take less time.
}
(load "elf/basic.scm")
(load "elf/sort.scm")
(load "dclass/record.scm")

(import "java.io.FileReader")
(import "java.io.BufferedReader")
(import "java.util.regex.Pattern")
(import "java.util.HashSet")
{

Utilities

}
(define (take n xs)
  ;; Return a list of the first n objects off the list xs.
  (define (take n xs sofar)
    (if (= n 0) (reverse sofar)
	(if (null? xs) (reverse sofar)
	    (take (- n 1) (cdr xs) (cons (car xs) sofar)))))
  (take n xs '()))

(define (ifNull x default) (if (isNull x) default x))

(define (// pattern)
  ;; Return a procedure that matches pattern.
  (let ((p (Pattern.compile pattern)))
    (lambda (string)
      (.find (.matcher p string)))))

(define startsWhite? (// "^[ \t]"))	; Starts with white space.

(define (BufferedFileReader f) (BufferedReader. (FileReader. f)))
{

Constants

These are the same values that Paul uses.
}
;;; Probability to use when (count w) < MIN_COUNT.
(define DEFAULT_PROBABILITY 0.4)
;;; If a word has less than MIN_COUNT examples us DEFAULT_PROBABILITY as
;;; its wordProb.
(define MIN_COUNT 5)
;;; Use the N_MOST_DEVIANT most deviant words to compute spam probability.
(define N_MOST_DEVIANT 15)
{

Class Word

A Word object is used to accumulate statistics about each word.
}
(define-record plan.Word
  (fields
   (name String)
   (nSpam 0)
   (nHam 0)
   (probability -1.0)
   (deviance -1.0)))

(import "plan.Word")
(define (nSpam+ w) (.nSpam$ w (+ (.nSpam$ w) 1)))
(define (nHam+  w) (.nHam$  w (+ (.nHam$  w) 1)))
(define (count w) (+ (.nSpam$ w) (.nHam$ w)))
{

Class Filter

A Filter is a collection of Words that can be used to filter spam.

}
(define-record plan.Filter
  (imports java.util.HashMap)
  (fields
   (nSpam 0)
   (nHam 0)
   (words HashMap (HashMap. 2000))))

(define (getWord filter w)
  (let* ((words (.words$ filter))
	 (it (.get words w)))
    (if (isNull it)
	(let ((it (Word. w)))
	  (.put words w it)
	  it)
	it)))

(define (learn filter spams hams)
  ;; Learn on a lists of spam and ham.
  (for-each (lambda (m) (merge filter m 'spam)) spams)
  (for-each (lambda (m) (merge filter m 'ham)) hams)
  ;; Cache the probability and deviance.
  (for-each* (lambda (w)
	       (.probability$ w (spamProb filter w))
	       (.deviance$ w (abs (- (.probability$ w) 0.5))))
	     (.words$ filter)))

(define (merge filter message class)
  ;; Merge statistics from a message. class is either 'spam or 'ham.
  (if (eq? class 'spam)
      (begin (tokenMerge filter message nSpam+)
	     (nSpam+ filter))
      (begin (tokenMerge filter message nHam+)
	     (nHam+ filter))))

(define (tokenMerge filter message increment)
  (iterate (tokenize message)
	   (lambda (word) (increment (getWord filter word)))))

(define (spamProb filter w)
  ;; Probability that a word is from a Spam.
  (let ((g (exact->inexact (* 2 (.nHam$ w))))
	(b (exact->inexact (.nSpam$ w))))
    (if (< (+ g b) MIN_COUNT) DEFAULT_PROBABILITY
	(max 0.001 (min 0.999 (/ (/ b (.nSpam$ filter))
				 (+ (/ g (.nHam$ filter))
				    (/ b (.nSpam$ filter)))))))))

(define (topWords filter message)
  ;;  Most extreme words.
  (take N_MOST_DEVIANT
	(sort (map* (lambda (w) (getWord filter w)) (tokenize message))
	      (comparator > .deviance$))))

(define (spammyness filter message)
  ;; Probability that message is spam.
  (let* ((probs (map (lambda (w) (spamProb filter w))
		     (topWords filter message)))
	 (prod (apply * probs)))
    (/ prod (+ prod (apply * (map (lambda (p) (- 1.0 p)) probs))))))

(define (spamReport filter message)
  ;; Display message with top words and their probability
  (display {--- spam report ---\n})
  (display message)
  (newline)
  (for-each (lambda (w) (display {[w]\n}))
	    (topWords filter message))
  (display {[(spammyness filter message)]\n}))

(define (showWords filter)
  ;; Show each word and probability.
  ;; Don't let spamers see this!
  (let ((fields (list .name$ .nSpam$ .nHam$ .probability)))
    (for-each (lambda (w) (print (map (lambda (f) (f w)) fields)))
	      (sort (filter (lambda (w) (> (count w) MIN_COUNT))
			    (.words$ filter))
		    (comparator > wordProb)))))
{

Messages

Messages are read in from a file where a new message begins with a line that can be matched by messageSeparator?

}
;;; Pick a message separator for your mail
(define messageSeparator? (// "^From "))
{
Ignore headers that may indicate spam directly, such as headers added by spam-assassin. We assume that spam headers are on one line.
}
(define ignoreHeader? (// {[(separate
			     "|"
			     (map (lambda (h) {^[h]: })
				  '(X-Spam-Status
				    X-Spam-Score
				    X-Spam-Report
				    X-Spam-Prev-Content-Type
				    X-Spam-Prev-Content-Transfer-Encoding
				    X-Spam-Level
				    X-Spam-Flag
				    X-Spam-Checker-Version
				    X-Scanned-By
				    Old-X-Spam-Status
				    Old-X-Scanned-By
				    SPAM)))]}))

(define (readMessage reader)
  ;; Read message into a String, or #f.
  (define (!) (.readLine reader))
  (define (start it sb)
    (if (or (isNull it) (messageSeparator? it))
	(let ((it (.toString sb)))
	  (if (.equals it "") #f
	      it))
	(if (ignoreHeader? it) (trimHeader (!) sb)
	    (start (!) (begin (.append sb it)
				     (.append sb "\n"))))))
  (define (trimHeader it sb)
    (if (startsWhite? it) (trimHeader (!) sb)
	(start it sb)))
  (start (!) (StringBuffer.)))

(define (messageStream file)
  ;; Returns a thunk which is a stream of messages.
  ;; Each call to thunk returns a message or #f when stream is empty.
  (let ((reader(BufferedFileReader file)))
    (.read reader)			; Skip first separator.
    (lambda () (readMessage reader))))

(define (readMessages file)
  (let ((s (messageStream file)))
    (define (readMessages sofar)
      (let ((it (s)))
	(if it (readMessages (cons it sofar))
	    (reverse sofar))))
    (readMessages '())))
    
(define tokenize
  ;; Return a HashSet of tokens in string.
  (let ((p (Pattern.compile "([-'$a-zA-Z0-9]+)")))
    (lambda (string)
      (let ((m (.matcher p string))
	    (set (HashSet.)))
	(let loop ()
	  (if (.find m)
	      (begin (.add set (.toLowerCase (.group m 1)))
		     (loop))
	      set))))))

;;; scp index.html plan.html plan.scm example.scm style.css kanderso@openmap:.public_html/plan
;;; http://java.sun.com/products/javamail/
;;; http://prdownloads.sourceforge.net/iharder/base64-1.3.6.zip
{
 
}