*** tokenizer.py Thu Dec 9 17:54:59 2004 --- tokenizer2.py Thu Dec 9 17:44:01 2004 *************** *** 1314,1319 **** --- 1314,1350 ---- elif valid_habeas == 9: yield "x-habeas-swe:valid" + # SPF headers - see http://spf.pobox.com + if options["Tokenizer", "x-tokenize_spf_headers"]: + spf_re = re.compile(r"(\w+)( \(.+\))?(.*)", re.DOTALL) + spf_headers = msg.get_all('Received-SPF', []) + for spf in spf_headers: + mo = spf_re.search(spf) + if mo: + result = mo.group(1) + yield "spf:result:" + result + + # Comments are unlikely to be uniform enough to be of + # use, so do nothing with them for now. We could + # yield the smtp-received-hostname (split on ':'), if + # that would help. + comment = mo.group(2) + key_value_pairs = mo.group(3).split(';') + for key_value_pair in key_value_pairs: + if key_value_pair: + try: + key, value = key_value_pair.split('=', 1) + except ValueError: + yield "spf:badly_formed" + else: + # These may not be any use, or they might + # need to be processed more, e.g. + # 'client-ip' could be treated like we + # treat other IP addresses. + yield "spf:%s:%s" % (key, value) + else: + yield "spf:badly_formed" + # Subject: # Don't ignore case in Subject lines; e.g., 'free' versus 'FREE' is # especially significant in this context. Experiment showed a small
*** Options.py Thu Dec 9 17:55:38 2004 --- Options2.py Thu Dec 9 16:34:28 2004 *************** *** 171,176 **** --- 171,186 ---- the ability to reduce the nine tokens to one. (This option has no effect if search_for_habeas_headers is False)""", BOOLEAN, RESTORE), + + ("x-tokenize_spf_headers", "Tokenize SPF Headers", False, + """(EXPERIMENTAL) SPF (Sender Policy Framework) is an attempt to + reduce spam by verifying the sender of messages when they are + received. SPF processors may optionally add a Received-SPF header + to messages that do not fail (failing messages bounce) indicating + why it did not fail. This option generates tokens from any SPF + headers present in a message. See http://spf.pobox.com for more + information about SPF.""", + BOOLEAN, RESTORE), ), # These options are all experimental; it seemed better to put them into
Results on my recent mail (which contains some instances of the header, though not many), from timcv.py -n 10, compared against a run with all defaults:
-> <stat> tested 1690 hams & 370 spams against 15232 hams & 3479 spams [etc] filename: spf_bases spfs ham:spam: 16922:3849 16922:3849 fp total: 23 23 fp %: 0.14 0.14 fn total: 105 105 fn %: 2.73 2.73 unsure t: 243 244 unsure %: 1.17 1.17 real cost: $383.60 $383.80 best cost: $331.00 $331.20 h mean: 0.22 0.22 h sdev: 4.13 4.13 s mean: 95.29 95.29 s sdev: 17.65 17.66 mean diff: 95.07 95.07 k: 4.37 4.36
spf_bases.txt -> spfs.txt -> <stat> tested 1690 hams & 370 spams against 15232 hams & 3479 spams [etc] false positive percentages 0.000 0.000 tied 0.119 0.119 tied 0.121 0.121 tied 0.179 0.179 tied 0.115 0.115 tied 0.179 0.179 tied 0.298 0.298 tied 0.121 0.121 tied 0.117 0.117 tied 0.114 0.114 tied won 0 times tied 10 times lost 0 times total unique fp went from 23 to 23 tied mean fp % went from 0.136166422887 to 0.136166422887 tied false negative percentages 3.243 3.243 tied 2.051 2.051 tied 1.995 1.995 tied 3.279 3.279 tied 2.709 2.709 tied 1.458 1.458 tied 1.366 1.366 tied 2.350 2.350 tied 3.846 3.846 tied 4.762 4.762 tied won 0 times tied 10 times lost 0 times total unique fn went from 105 to 105 tied mean fn % went from 2.70593601197 to 2.70593601197 tied ham mean ham sdev 0.08 0.08 +0.00% 1.93 1.93 +0.00% 0.15 0.15 +0.00% 3.47 3.47 +0.00% 0.19 0.19 +0.00% 3.63 3.63 +0.00% 0.26 0.26 +0.00% 4.52 4.52 +0.00% 0.20 0.20 +0.00% 3.92 3.92 +0.00% 0.30 0.30 +0.00% 4.70 4.70 +0.00% 0.36 0.36 +0.00% 5.73 5.73 +0.00% 0.28 0.28 +0.00% 4.50 4.50 +0.00% 0.25 0.25 +0.00% 4.37 4.37 +0.00% 0.16 0.16 +0.00% 3.47 3.47 +0.00% ham mean and sdev for all runs 0.22 0.22 +0.00% 4.13 4.13 +0.00% spam mean spam sdev 94.29 94.29 +0.00% 19.09 19.09 +0.00% 95.66 95.66 +0.00% 16.16 16.16 +0.00% 96.53 96.53 +0.00% 15.36 15.36 +0.00% 95.52 95.52 +0.00% 17.93 17.93 +0.00% 93.94 93.94 +0.00% 19.16 19.16 +0.00% 96.21 96.21 +0.00% 14.49 14.49 +0.00% 96.83 96.83 +0.00% 13.34 13.34 +0.00% 95.73 95.73 +0.00% 16.55 16.55 +0.00% 94.61 94.60 -0.01% 20.06 20.07 +0.05% 93.70 93.70 +0.00% 21.78 21.78 +0.00% spam mean and sdev for all runs 95.29 95.29 +0.00% 17.65 17.66 +0.06% ham/spam mean difference: 95.07 95.07 +0.00
There's barely any change, so I'm -0 on adding this as an experimental option (unless future mail includes more instances of the header, of course).
If anyone else posts results here and they help, then maybe this can go into 1.1 as a new experimental option.