From becc3d5b4ea19c7d50f29f2a8ce3ec8baf7cc585 Mon Sep 17 00:00:00 2001 From: Karsten Loesing Date: Sat, 8 Feb 2014 11:52:33 +0100 Subject: Remove script to clean up GeoLite country databases. --- src/config/README.geoip | 90 -------------------- src/config/deanonymind.py | 205 ---------------------------------------------- src/config/geoip-manual | 80 ------------------ 3 files changed, 375 deletions(-) delete mode 100644 src/config/README.geoip delete mode 100755 src/config/deanonymind.py delete mode 100644 src/config/geoip-manual diff --git a/src/config/README.geoip b/src/config/README.geoip deleted file mode 100644 index 852050140..000000000 --- a/src/config/README.geoip +++ /dev/null @@ -1,90 +0,0 @@ -README.geoip -- information on the IP-to-country-code file shipped with tor -=========================================================================== - -The IP-to-country-code file in src/config/geoip is based on MaxMind's -GeoLite Country database with the following modifications: - - - Those "A1" ("Anonymous Proxy") entries lying in between two entries with - the same country code are automatically changed to that country code. - These changes can be overridden by specifying a different country code - in src/config/geoip-manual. - - - Other "A1" entries are replaced with country codes specified in - src/config/geoip-manual, or are left as is if there is no corresponding - entry in that file. Even non-"A1" entries can be modified by adding a - replacement entry to src/config/geoip-manual. Handle with care. - - -1. Updating the geoip file from a MaxMind database file -------------------------------------------------------- - -Download the most recent MaxMind GeoLite Country database: -http://geolite.maxmind.com/download/geoip/database/GeoIPCountryCSV.zip - -Run `python deanonymind.py` in the local directory. Review the output to -learn about applied automatic/manual changes and watch out for any -warnings.
- -Possibly edit geoip-manual to make more/fewer/different manual changes and -re-run `python deanonymind.py`. - -When done, prepend the new geoip file with a comment like this: - - # Last updated based on $DATE Maxmind GeoLite Country - # See README.geoip for details on the conversion. - - -2. Verifying automatic and manual changes using diff ----------------------------------------------------- - -To unzip the original MaxMind file and look at the automatic changes, run: - - unzip GeoIPCountryCSV.zip - diff -U1 GeoIPCountryWhois.csv AutomaticGeoIPCountryWhois.csv - -To look at subsequent manual changes, run: - - diff -U1 AutomaticGeoIPCountryWhois.csv ManualGeoIPCountryWhois.csv - -To manually generate the geoip file and compare it to the automatically -created one, run: - - cut -d, -f3-5 < ManualGeoIPCountryWhois.csv | sed 's/"//g' > mygeoip - diff -U1 geoip mygeoip - - -3. Verifying automatic and manual changes using blockfinder ------------------------------------------------------------ - -Blockfinder is a powerful tool to handle multiple IP-to-country data -sources. Blockfinder has a function to specify a country code and compare -conflicting country code assignments in different data sources. - -We can use blockfinder to compare A1 entries in the original MaxMind file -with the same or overlapping blocks in the file generated above and in the -RIR delegation files: - - git clone https://github.com/ioerror/blockfinder - cd blockfinder/ - python blockfinder -i - python blockfinder -r ../GeoIPCountryWhois.csv - python blockfinder -r ../ManualGeoIPCountryWhois.csv - python blockfinder -p A1 > A1-comparison.txt - -The output marks conflicts between assignments using either '*' in case of -two different opinions or '#' for three or more different opinions about -the country code for a given block. 
- -The '*' conflicts are most likely harmless, because there will always be -at least two opinions with the original MaxMind file saying A1 and the -other two sources saying something more meaningful. - -However, watch out for '#' conflicts. In these cases, the original -MaxMind file ("A1"), the updated MaxMind file (hopefully the correct -country code), and the RIR delegation files (some other country code) all -disagree. - -There are perfectly valid cases where the updated MaxMind file and the RIR -delegation files don't agree. But each of those cases must be verified -manually. - diff --git a/src/config/deanonymind.py b/src/config/deanonymind.py deleted file mode 100755 index 31d0658ee..000000000 --- a/src/config/deanonymind.py +++ /dev/null @@ -1,205 +0,0 @@ -#!/usr/bin/env python -import optparse -import os -import sys -import zipfile - -""" -Take a MaxMind GeoLite Country database as input and replace A1 entries -with the country code and name of the preceding entry iff the preceding -(subsequent) entry ends (starts) directly before (after) the A1 entry and -both preceding and subsequent entries contain the same country code. - -Then apply manual changes, either replacing A1 entries that could not be -replaced automatically or overriding previously made automatic changes. 
-""" - -def main(): - options = parse_options() - assignments = read_file(options.in_maxmind) - assignments = apply_automatic_changes(assignments) - write_file(options.out_automatic, assignments) - manual_assignments = read_file(options.in_manual, must_exist=False) - assignments = apply_manual_changes(assignments, manual_assignments) - write_file(options.out_manual, assignments) - write_file(options.out_geoip, assignments, long_format=False) - -def parse_options(): - parser = optparse.OptionParser() - parser.add_option('-i', action='store', dest='in_maxmind', - default='GeoIPCountryCSV.zip', metavar='FILE', - help='use the specified MaxMind GeoLite Country .zip or .csv ' - 'file as input [default: %default]') - parser.add_option('-g', action='store', dest='in_manual', - default='geoip-manual', metavar='FILE', - help='use the specified .csv file for manual changes or to ' - 'override automatic changes [default: %default]') - parser.add_option('-a', action='store', dest='out_automatic', - default="AutomaticGeoIPCountryWhois.csv", metavar='FILE', - help='write full input file plus automatic changes to the ' - 'specified .csv file [default: %default]') - parser.add_option('-m', action='store', dest='out_manual', - default='ManualGeoIPCountryWhois.csv', metavar='FILE', - help='write full input file plus automatic and manual ' - 'changes to the specified .csv file [default: %default]') - parser.add_option('-o', action='store', dest='out_geoip', - default='geoip', metavar='FILE', - help='write full input file plus automatic and manual ' - 'changes to the specified .csv file that can be shipped ' - 'with tor [default: %default]') - (options, args) = parser.parse_args() - return options - -def read_file(path, must_exist=True): - if not os.path.exists(path): - if must_exist: - print 'File %s does not exist. Exiting.' 
% (path, ) - sys.exit(1) - else: - return - if path.endswith('.zip'): - zip_file = zipfile.ZipFile(path) - csv_content = zip_file.read('GeoIPCountryWhois.csv') - zip_file.close() - else: - csv_file = open(path) - csv_content = csv_file.read() - csv_file.close() - assignments = [] - for line in csv_content.split('\n'): - stripped_line = line.strip() - if len(stripped_line) > 0 and not stripped_line.startswith('#'): - assignments.append(stripped_line) - return assignments - -def apply_automatic_changes(assignments): - print '\nApplying automatic changes...' - result_lines = [] - prev_line = None - a1_lines = [] - for line in assignments: - if '"A1"' in line: - a1_lines.append(line) - else: - if len(a1_lines) > 0: - new_a1_lines = process_a1_lines(prev_line, a1_lines, line) - for new_a1_line in new_a1_lines: - result_lines.append(new_a1_line) - a1_lines = [] - result_lines.append(line) - prev_line = line - if len(a1_lines) > 0: - new_a1_lines = process_a1_lines(prev_line, a1_lines, None) - for new_a1_line in new_a1_lines: - result_lines.append(new_a1_line) - return result_lines - -def process_a1_lines(prev_line, a1_lines, next_line): - if not prev_line or not next_line: - return a1_lines # Can't merge first or last line in file. - if len(a1_lines) > 1: - return a1_lines # Can't merge more than 1 line at once. 
- a1_line = a1_lines[0].strip() - prev_entry = parse_line(prev_line) - a1_entry = parse_line(a1_line) - next_entry = parse_line(next_line) - touches_prev_entry = int(prev_entry['end_num']) + 1 == \ - int(a1_entry['start_num']) - touches_next_entry = int(a1_entry['end_num']) + 1 == \ - int(next_entry['start_num']) - same_country_code = prev_entry['country_code'] == \ - next_entry['country_code'] - if touches_prev_entry and touches_next_entry and same_country_code: - new_line = format_line_with_other_country(a1_entry, prev_entry) - print '-%s\n+%s' % (a1_line, new_line, ) - return [new_line] - else: - return a1_lines - -def parse_line(line): - if not line: - return None - keys = ['start_str', 'end_str', 'start_num', 'end_num', - 'country_code', 'country_name'] - stripped_line = line.replace('"', '').strip() - parts = stripped_line.split(',') - entry = dict((k, v) for k, v in zip(keys, parts)) - return entry - -def format_line_with_other_country(original_entry, other_entry): - return '"%s","%s","%s","%s","%s","%s"' % (original_entry['start_str'], - original_entry['end_str'], original_entry['start_num'], - original_entry['end_num'], other_entry['country_code'], - other_entry['country_name'], ) - -def apply_manual_changes(assignments, manual_assignments): - if not manual_assignments: - return assignments - print '\nApplying manual changes...' - manual_dict = {} - for line in manual_assignments: - start_num = parse_line(line)['start_num'] - if start_num in manual_dict: - print ('Warning: duplicate start number in manual ' - 'assignments:\n %s\n %s\nDiscarding first entry.' 
% - (manual_dict[start_num], line, )) - manual_dict[start_num] = line - result = [] - for line in assignments: - entry = parse_line(line) - start_num = entry['start_num'] - if start_num in manual_dict: - manual_line = manual_dict[start_num] - manual_entry = parse_line(manual_line) - if entry['start_str'] == manual_entry['start_str'] and \ - entry['end_str'] == manual_entry['end_str'] and \ - entry['end_num'] == manual_entry['end_num']: - if len(manual_entry['country_code']) != 2: - print '-%s' % (line, ) # only remove, don't replace - del manual_dict[start_num] - elif entry['country_code'] != \ - manual_entry['country_code']: - new_line = format_line_with_other_country(entry, - manual_entry) - print '-%s\n+%s' % (line, new_line, ) - result.append(new_line) - del manual_dict[start_num] - else: - print ('Warning: not applying ineffective manual ' - 'change:\n %s\n %s' % (line, manual_line, )) - result.append(line) - else: - print ('Warning: not applying manual change that is only ' - 'a partial match:\n %s\n %s' % - (line, manual_line, )) - result.append(line) - elif 'country_code' in entry and \ - entry['country_code'] == 'A1': - print ('Warning: no manual replacement for A1 entry:\n %s' - % (line, )) - result.append(line) - else: - result.append(line) - if len(manual_dict) > 0: - print 'Warning: could not apply all manual assignments:' - for line in manual_dict.values(): - print ' %s' % (line, ) - return result - -def write_file(path, assignments, long_format=True): - if long_format: - output_lines = assignments - else: - output_lines = [] - for long_line in assignments: - entry = parse_line(long_line) - short_line = "%s,%s,%s" % (entry['start_num'], - entry['end_num'], entry['country_code'], ) - output_lines.append(short_line) - out_file = open(path, 'w') - out_file.write('\n'.join(output_lines)) - out_file.close() - -if __name__ == '__main__': - main() - diff --git a/src/config/geoip-manual b/src/config/geoip-manual deleted file mode 100644 index 
d51a12a1c..000000000 --- a/src/config/geoip-manual +++ /dev/null @@ -1,80 +0,0 @@ -# This file contains manual overrides of A1 entries (and possibly others) -# in MaxMind's GeoLite Country database. Use deanonymind.py in the same -# directory to process this file when producing a new geoip file. See -# README.geoip in the same directory for details. - -# GB, because RIR delegation files say exactly this range -# 46.16.32.0-46.16.39.255 is GB, even though neither previous nor next -# MaxMind range is GB. Both previous and next MaxMind ranges match RIR -# delegation files, too. -KL 2013-03-07 -"46.16.32.0","46.16.39.255","772808704","772810751","GB","United Kingdom" - -# CH, because previous MaxMind entry 46.19.141.0-46.19.142.255 is CH, and -# RIR delegation files say 46.19.136.0-46.19.143.255 is CH. -# -KL 2012-11-27 -"46.19.143.0","46.19.143.255","773033728","773033983","CH","Switzerland" - -# GB, because next MaxMind entry 46.166.129.0-46.166.134.255 is GB, and -# RIR delegation files say 46.166.128.0-46.166.191.255 is GB. -# -KL 2012-11-27 -"46.166.128.0","46.166.128.255","782663680","782663935","GB","United Kingdom" - -# US, because previous MaxMind entry 70.159.21.51-70.232.244.255 is US, -# because next MaxMind entry 70.232.245.58-70.232.245.59 is A2 ("Satellite -# Provider") which is country information about as useless as A1, and -# because RIR delegation files say 70.224.0.0-70.239.255.255 is US. -# -KL 2012-11-27 -"70.232.245.0","70.232.245.57","1189672192","1189672249","US","United States" - -# US, because next MaxMind entry 70.232.246.0-70.240.141.255 is US, -# because previous MaxMind entry 70.232.245.58-70.232.245.59 is A2 -# ("Satellite Provider") which is country information about as useless -# as A1, and because RIR delegation files say 70.224.0.0-70.239.255.255 is -# US.
-KL 2012-11-27 -"70.232.245.60","70.232.245.255","1189672252","1189672447","US","United States" - -# GB, despite neither previous (GE) nor next (LV) MaxMind entry being GB, -# but because RIR delegation files agree with both previous and next -# MaxMind entry and say GB for 91.228.0.0-91.228.3.255. -KL 2012-11-27 -"91.228.0.0","91.228.3.255","1541668864","1541669887","GB","United Kingdom" - -# NL, because next MaxMind entry 176.56.173.0-176.56.173.63 is NL, and RIR -# delegation files say 176.56.160.0-176.56.191.255 is NL. -KL 2013-05-13 -"176.56.172.0","176.56.172.255","2956504064","2956504319","NL","Netherlands" - -# NL, despite neither previous (RU) nor next (GB) MaxMind entry being NL, -# but because RIR delegation files say entire range -# 176.56.160.0-176.56.191.255 is NL. -KL 2013-05-13 -"176.56.174.0","176.56.174.255","2956504576","2956504831","NL","Netherlands" - -# GB, because RIR delegation files say exactly this range -# 185.25.84.0-185.25.87.255 is GB, even though neither previous nor next -# MaxMind range is GB. Both previous and next MaxMind ranges match RIR -# delegation files, too. -KL 2013-05-13 -"185.25.84.0","185.25.87.255","3105444864","3105445887","GB","United Kingdom" - -# US, because next MaxMind entry 199.101.193.0-199.101.195.255 is US, and, -# together with next entries, matches RIR delegation file entry -# 199.101.192.0-199.101.199.255 which is US. -KL 2013-05-13 -"199.101.192.0","199.101.192.255","3345334272","3345334527","US","United States" - -# US, because ARIN says 199.255.208.0-199.255.215.255 is US. -# Changed entry start from 199.255.213.0 to 199.255.208.0 on 2013-08-12. -# Split up into 199.255.208.0-199.255.209.127 and -# 199.255.210.0-199.255.215.255 on 2013-10-11. 
-KL 2013-10-11 -"199.255.208.0","199.255.209.127","3355430912","3355431295","US","United States" -"199.255.210.0","199.255.215.255","3355431424","3355432959","US","United States" - -# EU, despite neither previous (RU) nor next (SE) MaxMind entry being EU, -# but because RIR delegation files agree with previous MaxMind entry and -# say EU for 217.15.160.0-217.15.175.255. -KL 2013-05-13 -"217.15.160.0","217.15.164.255","3641679872","3641681151","EU","Europe" - -# FR, because previous MaxMind entry 217.15.166.0-217.15.166.255 is FR, -# and RIR delegation files contain a block 217.15.160.0-217.15.175.255 -# which, however, is EU, not FR. But merging with next MaxMind entry -# 217.15.176.0-217.15.191.255 which is KZ and which fully matches what -# the RIR delegation files say seems unlikely to be correct. -# -KL 2012-11-27 -"217.15.167.0","217.15.175.255","3641681664","3641683967","FR","France" - -- cgit v1.2.3