diff options
author | Jochen Topf <jochen@topf.org> | 2014-05-12 11:17:23 +0200 |
---|---|---|
committer | Jochen Topf <jochen@topf.org> | 2014-05-12 11:17:23 +0200 |
commit | b46b21e3437a8ea04429995c038b64d654eaa48e (patch) | |
tree | dbf4d3e5be7b1d745f8b4527930e0314b2b4d876 | |
parent | 5407efff508b5428f30ea28f113bf370815dfabe (diff) | |
download | taginfo-b46b21e3437a8ea04429995c038b64d654eaa48e.tar taginfo-b46b21e3437a8ea04429995c038b64d654eaa48e.tar.gz |
Add code to create maps for tags.
For a long time we have had the capability to create overview maps for keys, but
never for tags (i.e. key-value combinations). This commit now adds code to
create maps for frequently used tags. We can't create maps for all tags,
because each map takes about 8 kB of RAM and there are a lot of tags.
-rw-r--r-- | sources/db/post.sql | 2 | ||||
-rw-r--r-- | sources/db/pre.sql | 9 | ||||
l---------[-rwxr-xr-x] | sources/db/tagstats | bin | 417137 -> 23 bytes | |||
-rwxr-xr-x | sources/db/update.sh | 12 | ||||
-rw-r--r-- | tagstats/tagstats.cpp | 15 | ||||
-rw-r--r-- | tagstats/tagstats_handler.hpp | 85 |
6 files changed, 109 insertions, 14 deletions
diff --git a/sources/db/post.sql b/sources/db/post.sql index 0edba18..35c0a43 100644 --- a/sources/db/post.sql +++ b/sources/db/post.sql @@ -19,6 +19,8 @@ CREATE INDEX key_combinations_key1_idx ON key_combinations (key1); CREATE INDEX key_combinations_key2_idx ON key_combinations (key2); CREATE UNIQUE INDEX key_distributions_key_idx ON key_distributions (key, object_type); +CREATE UNIQUE INDEX tag_distributions_key_value_idx ON tag_distributions (key, value, object_type); + CREATE INDEX tag_combinations_key1_value1_idx ON tag_combinations (key1, value1); CREATE INDEX tag_combinations_key2_value2_idx ON tag_combinations (key2, value2); diff --git a/sources/db/pre.sql b/sources/db/pre.sql index 5654074..905da8d 100644 --- a/sources/db/pre.sql +++ b/sources/db/pre.sql @@ -52,6 +52,15 @@ CREATE TABLE key_distributions ( png BLOB ); +DROP TABLE IF EXISTS tag_distributions; + +CREATE TABLE tag_distributions ( + key VARCHAR, + value VARCHAR, + object_type VARCHAR(1), -- (n)ode, (w)ay, + png BLOB +); + DROP TABLE IF EXISTS tags; CREATE TABLE tags ( diff --git a/sources/db/tagstats b/sources/db/tagstats Binary files differindex 24da09e..f1281f0 100755..120000 --- a/sources/db/tagstats +++ b/sources/db/tagstats diff --git a/sources/db/update.sh b/sources/db/update.sh index 71abd9d..8045efc 100755 --- a/sources/db/update.sh +++ b/sources/db/update.sh @@ -50,8 +50,18 @@ if [ "x" = "x$TAGSTATS" ]; then TAGSTATS="./tagstats" fi +if [ ! -f $DIR/interesting_tags.lst ]; then + echo "File $DIR/interesting_tags.lst missing. Not creating combination statistics." + echo " The next taginfo update should automatically correct this." +fi + +if [ ! -f $DIR/frequent_tags.lst ]; then + echo "File $DIR/frequent_tags.lst missing. Not creating maps for tags." + echo " The next taginfo update should automatically correct this." 
+fi + #TAGSTATS="valgrind --leak-check=full --show-reachable=yes $TAGSTATS" -$TAGSTATS --tags $DIR/interesting_tags.lst --min-tag-combination-count=$min_tag_combination_count --relation-types $DIR/interesting_relation_types.lst --left=$left --bottom=$bottom --top=$top --right=$right --width=$width --height=$height $PLANETFILE $DATABASE +$TAGSTATS --tags $DIR/interesting_tags.lst --map-tags $DIR/frequent_tags.lst --min-tag-combination-count=$min_tag_combination_count --relation-types $DIR/interesting_relation_types.lst --left=$left --bottom=$bottom --top=$top --right=$right --width=$width --height=$height $PLANETFILE $DATABASE echo "`$DATECMD` Running update_characters... " ./update_characters.rb $DIR diff --git a/tagstats/tagstats.cpp b/tagstats/tagstats.cpp index 91b63ed..41b05ee 100644 --- a/tagstats/tagstats.cpp +++ b/tagstats/tagstats.cpp @@ -1,6 +1,6 @@ /* - Copyright 2012 Jochen Topf <jochen@topf.org>. + Copyright 2012-2014 Jochen Topf <jochen@topf.org>. This file is part of Tagstats. 
@@ -80,6 +80,7 @@ int main(int argc, char *argv[]) { {"tags", required_argument, 0, 'T'}, {"min-tag-combination-count", required_argument, 0, 'm'}, #endif // TAGSTATS_COUNT_TAG_COMBINATIONS + {"map-tags", required_argument, 0, 'M'}, {"relation-types", required_argument, 0, 'R'}, {"top", required_argument, 0, 't'}, {"right", required_argument, 0, 'r'}, @@ -91,6 +92,7 @@ int main(int argc, char *argv[]) { }; std::string tags_list; + std::string map_tags_list; std::string relation_type_list; double top = 90; @@ -106,9 +108,9 @@ int main(int argc, char *argv[]) { while (true) { int c = getopt_long(argc, argv, #ifdef TAGSTATS_COUNT_TAG_COMBINATIONS - "dHR:t:r:b:l:w:h:T:m:", + "dHR:t:r:b:l:w:h:M:T:m:", #else - "dHR:t:r:b:l:w:h:", + "dHR:t:r:b:l:w:h:M:", #endif // TAGSTATS_COUNT_TAG_COMBINATIONS long_options, 0); if (c == -1) { @@ -123,6 +125,9 @@ int main(int argc, char *argv[]) { case 'T': tags_list = optarg; break; + case 'M': + map_tags_list = optarg; + break; case 'm': min_tag_combination_count = atoi(optarg); break; @@ -162,7 +167,9 @@ int main(int argc, char *argv[]) { Osmium::OSMFile infile(argv[optind]); Sqlite::Database db(argv[optind+1]); MapToInt<rough_position_t> map_to_int(left, bottom, right, top, width, height); - TagStatsHandler handler(db, tags_list, relation_type_list, map_to_int, min_tag_combination_count); + TagStatsHandler handler(db, tags_list, map_tags_list, relation_type_list, map_to_int, min_tag_combination_count); Osmium::Input::read(infile, handler); + + google::protobuf::ShutdownProtobufLibrary(); } diff --git a/tagstats/tagstats_handler.hpp b/tagstats/tagstats_handler.hpp index 4069dd1..9c04dd9 100644 --- a/tagstats/tagstats_handler.hpp +++ b/tagstats/tagstats_handler.hpp @@ -3,7 +3,7 @@ /* - Copyright 2012 Jochen Topf <jochen@topf.org>. + Copyright 2012-2014 Jochen Topf <jochen@topf.org>. This file is part of Tagstats. 
@@ -181,6 +181,7 @@ public: }; // class KeyValueStats typedef google::sparse_hash_map<const char *, KeyValueStats *, djb2_hash, eqstr> key_value_hash_map_t; +typedef google::sparse_hash_map<const char *, GeoDistribution *, djb2_hash, eqstr> key_value_geodistribution_hash_map_t; #endif // TAGSTATS_COUNT_TAG_COMBINATIONS struct RelationRoleStats { @@ -253,6 +254,8 @@ class TagStatsHandler : public Osmium::Handler::Base { key_value_hash_map_t m_key_value_stats; #endif // TAGSTATS_COUNT_TAG_COMBINATIONS + key_value_geodistribution_hash_map_t m_key_value_geodistribution; + relation_type_stats_map_t m_relation_type_stats; time_t m_max_timestamp; @@ -332,7 +335,7 @@ class TagStatsHandler : public Osmium::Handler::Base { } #endif // TAGSTATS_COUNT_TAG_COMBINATIONS - void _print_and_clear_distribution_images(bool for_nodes) { + void _print_and_clear_key_distribution_images(bool for_nodes) { int sum_size=0; Sqlite::Statement statement_insert_into_key_distributions(m_database, "INSERT INTO key_distributions (key, object_type, png) VALUES (?, ?, ?);"); @@ -363,7 +366,46 @@ class TagStatsHandler : public Osmium::Handler::Base { } std::cerr << "gridcells_all: " << GeoDistribution::count_all_set_cells() << std::endl; - std::cerr << "sum of location image sizes: " << sum_size << std::endl; + std::cerr << "sum of location image sizes: " << sum_size << " bytes\n"; + + m_database.commit(); + } + + void _print_and_clear_tag_distribution_images(bool for_nodes) { + int sum_size=0; + + Sqlite::Statement statement_insert_into_tag_distributions(m_database, "INSERT INTO tag_distributions (key, value, object_type, png) VALUES (?, ?, ?, ?);"); + m_database.begin_transaction(); + + for (key_value_geodistribution_hash_map_t::const_iterator it = m_key_value_geodistribution.begin(); it != m_key_value_geodistribution.end(); it++) { + GeoDistribution* geo = it->second; + + int size; + void* ptr = geo->create_png(&size); + sum_size += size; + + std::vector<std::string> kv; + boost::split(kv, 
it->first, boost::is_any_of("=")); + kv.push_back(""); // if there is no = in key, make sure there is an empty value + + statement_insert_into_tag_distributions + .bind_text(kv[0].c_str()) // column: key + .bind_text(kv[1].c_str()) // column: value + .bind_text(for_nodes ? "n" : "w") // column: object_type + .bind_blob(ptr, size) // column: png + .execute(); + + geo->free_png(ptr); + + if (for_nodes) { + geo->clear(); + } else { + delete geo; + } + } + + std::cerr << "gridcells_all: " << GeoDistribution::count_all_set_cells() << std::endl; + std::cerr << "sum of location image sizes: " << sum_size << " bytes\n"; m_database.commit(); } @@ -411,8 +453,17 @@ class TagStatsHandler : public Osmium::Handler::Base { } stat->update(it->value(), object, m_string_store); + std::string keyvalue = it->key(); + keyvalue += "="; + keyvalue += it->value(); + if (object.type() == NODE) { - stat->distribution.add_coordinate(m_map_to_int(static_cast<const Osmium::OSM::Node&>(object).position())); + rough_position_t location = m_map_to_int(static_cast<const Osmium::OSM::Node&>(object).position()); + stat->distribution.add_coordinate(location); + key_value_geodistribution_hash_map_t::iterator gd_it = m_key_value_geodistribution.find(keyvalue.c_str()); + if (gd_it != m_key_value_geodistribution.end()) { + gd_it->second->add_coordinate(location); + } } #ifdef TAGSTATS_GEODISTRIBUTION_FOR_WAYS else if (object.type() == WAY) { @@ -421,7 +472,12 @@ class TagStatsHandler : public Osmium::Handler::Base { // coordinates of all nodes? 
const Osmium::OSM::WayNodeList& wnl = static_cast<const Osmium::OSM::Way&>(object).nodes(); if (!wnl.empty()) { - stat->distribution.add_coordinate(m_storage[wnl.front().ref()]); + rough_position_t location = m_storage[wnl.front().ref()]; + stat->distribution.add_coordinate(location); + key_value_geodistribution_hash_map_t::iterator gd_it = m_key_value_geodistribution.find(keyvalue.c_str()); + if (gd_it != m_key_value_geodistribution.end()) { + gd_it->second->add_coordinate(location); + } } } #endif // TAGSTATS_GEODISTRIBUTION_FOR_WAYS @@ -446,7 +502,7 @@ class TagStatsHandler : public Osmium::Handler::Base { public: - TagStatsHandler(Sqlite::Database& database, const std::string& tags_list, const std::string& relation_type_list, MapToInt<rough_position_t>& map_to_int, unsigned int min_tag_combination_count) : + TagStatsHandler(Sqlite::Database& database, const std::string& tags_list, const std::string& map_tags_list, const std::string& relation_type_list, MapToInt<rough_position_t>& map_to_int, unsigned int min_tag_combination_count) : Base(), m_min_tag_combination_count(min_tag_combination_count), m_max_timestamp(0), @@ -458,13 +514,20 @@ public: , m_storage() #endif { + std::string key_value; + #ifdef TAGSTATS_COUNT_TAG_COMBINATIONS std::ifstream tags_list_file(tags_list.c_str(), std::ifstream::in); - std::string key_value; while (tags_list_file >> key_value) { m_key_value_stats[m_string_store.add(key_value.c_str())] = new KeyValueStats(); } #endif // TAGSTATS_COUNT_TAG_COMBINATIONS + + std::ifstream map_tags_list_file(map_tags_list.c_str(), std::ifstream::in); + while (map_tags_list_file >> key_value) { + m_key_value_geodistribution[m_string_store.add(key_value.c_str())] = new GeoDistribution(); + } + std::ifstream relation_type_list_file(relation_type_list.c_str(), std::ifstream::in); std::string type; while (relation_type_list_file >> type) { @@ -515,7 +578,10 @@ public: .execute(); m_database.commit(); - _print_and_clear_distribution_images(true); + 
gdFree(ptr); + + _print_and_clear_key_distribution_images(true); + _print_and_clear_tag_distribution_images(true); timer = time(0); _timer_info("dumping images"); _print_memory_usage(); @@ -528,7 +594,8 @@ public: void after_ways() { _timer_info("processing ways"); #ifdef TAGSTATS_GEODISTRIBUTION_FOR_WAYS - _print_and_clear_distribution_images(false); + _print_and_clear_key_distribution_images(false); + _print_and_clear_tag_distribution_images(false); #endif _print_memory_usage(); } |