aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorNick Mathewson <nickm@torproject.org>2007-08-21 05:37:24 +0000
committerNick Mathewson <nickm@torproject.org>2007-08-21 05:37:24 +0000
commit7dbe7fd4d86f202714ce110b46f59e6ed243af16 (patch)
tree374bb598df817e4f60b80da4b22d2360f912ba19 /src
parent8cb6b2bc74c037331f7da26d4f3f92b34b4b9b98 (diff)
downloadtor-7dbe7fd4d86f202714ce110b46f59e6ed243af16.tar
tor-7dbe7fd4d86f202714ce110b46f59e6ed243af16.tar.gz
r14758@catbus: nickm | 2007-08-21 01:36:03 -0400
Finish implementing and documenting proposal 108: Authorities now use MTBF data to set their stability flags, once they have at least 4 days of data to use. svn:r11240
Diffstat (limited to 'src')
-rw-r--r--src/or/dirserv.c49
-rw-r--r--src/or/or.h1
-rw-r--r--src/or/rephist.c107
3 files changed, 137 insertions, 20 deletions
diff --git a/src/or/dirserv.c b/src/or/dirserv.c
index 4762b9980..1401a6c90 100644
--- a/src/or/dirserv.c
+++ b/src/or/dirserv.c
@@ -1500,6 +1500,9 @@ should_generate_v2_networkstatus(void)
* network using allegedly high-uptime nodes, displacing all the
* current guards. */
#define UPTIME_TO_GUARANTEE_STABLE (3600*24*30)
+/* If a router's MTBF is at least this value, then it is always stable.
+ * See above. */
+#define MTBF_TO_GUARANTEE_STABLE (60*60*24*10)
/** Similarly, we protect sufficiently fast nodes from being pushed
* out of the set of Fast nodes. */
#define BANDWIDTH_TO_GUARANTEE_FAST (100*1024)
@@ -1511,6 +1514,8 @@ should_generate_v2_networkstatus(void)
* dirserv_compute_performance_thresholds, and used by
* generate_v2_networkstatus */
static uint32_t stable_uptime = 0; /* start at a safe value */
+static double stable_mtbf = 0.0;
+static int enough_mtbf_info = 0;
static uint32_t fast_bandwidth = 0;
static uint32_t guard_bandwidth_including_exits = 0;
static uint32_t guard_bandwidth_excluding_exits = 0;
@@ -1539,10 +1544,20 @@ dirserv_thinks_router_is_unreliable(time_t now,
int need_uptime, int need_capacity)
{
if (need_uptime) {
- int uptime = real_uptime(router, now);
- if ((unsigned)uptime < stable_uptime &&
- (unsigned)uptime < UPTIME_TO_GUARANTEE_STABLE)
- return 1;
+ if (!enough_mtbf_info) {
+ /* XXXX Once most authorities are on v3, we should change the rule from
+ * "use uptime if we don't have mtbf data" to "don't advertise Stable on
+ * v3 if we don't have enough mtbf data." */
+ int uptime = real_uptime(router, now);
+ if ((unsigned)uptime < stable_uptime &&
+ (unsigned)uptime < UPTIME_TO_GUARANTEE_STABLE)
+ return 1;
+ } else {
+ double mtbf =
+ rep_hist_get_stability(router->cache_info.identity_digest, now);
+ if (mtbf < stable_mtbf && mtbf < MTBF_TO_GUARANTEE_STABLE)
+ return 1;
+ }
}
if (need_capacity) {
uint32_t bw = router_get_advertised_bandwidth(router);
@@ -1563,6 +1578,17 @@ _compare_uint32(const void **a, const void **b)
return 0;
}
+/** Helper: return -1, 1, or 0 according to whether **(double**)<b>a</b>
+ * is less than, greater than, or equal to **(double**)<b>b</b>. */
+static int
+_compare_double(const void **a, const void **b)
+{
+ double first = **(double **)a, second = **(double **)b;
+ if (first < second) return -1;
+ if (first > second) return 1;
+ return 0;
+}
+
/** Look through the routerlist, and assign the median uptime of running valid
* servers to stable_uptime, and the relative bandwidth capacities to
* fast_bandwidth and guard_bandwidth. Set total_bandwidth to the total
@@ -1572,7 +1598,7 @@ _compare_uint32(const void **a, const void **b)
static void
dirserv_compute_performance_thresholds(routerlist_t *rl)
{
- smartlist_t *uptimes, *bandwidths, *bandwidths_excluding_exits;
+ smartlist_t *uptimes, *mtbfs, *bandwidths, *bandwidths_excluding_exits;
time_t now = time(NULL);
/* initialize these all here, in case there are no routers */
@@ -1585,16 +1611,21 @@ dirserv_compute_performance_thresholds(routerlist_t *rl)
total_exit_bandwidth = 0;
uptimes = smartlist_create();
+ mtbfs = smartlist_create();
bandwidths = smartlist_create();
bandwidths_excluding_exits = smartlist_create();
+ /* XXXX020 we should just use arrays and qsort. */
SMARTLIST_FOREACH(rl->routers, routerinfo_t *, ri, {
if (router_is_active(ri, now)) {
uint32_t *up = tor_malloc(sizeof(uint32_t));
uint32_t *bw = tor_malloc(sizeof(uint32_t));
+ double *mtbf = tor_malloc(sizeof(double));
ri->is_exit = exit_policy_is_general_exit(ri->exit_policy);
*up = (uint32_t) real_uptime(ri, now);
smartlist_add(uptimes, up);
+ *mtbf = rep_hist_get_stability(ri->cache_info.identity_digest, now);
+ smartlist_add(mtbfs, mtbf);
*bw = router_get_advertised_bandwidth(ri);
total_bandwidth += *bw;
if (ri->is_exit && !ri->is_bad_exit) {
@@ -1609,6 +1640,7 @@ dirserv_compute_performance_thresholds(routerlist_t *rl)
});
smartlist_sort(uptimes, _compare_uint32);
+ smartlist_sort(mtbfs, _compare_double);
smartlist_sort(bandwidths, _compare_uint32);
smartlist_sort(bandwidths_excluding_exits, _compare_uint32);
@@ -1616,6 +1648,11 @@ dirserv_compute_performance_thresholds(routerlist_t *rl)
stable_uptime = *(uint32_t*)smartlist_get(uptimes,
smartlist_len(uptimes)/2);
+ if (smartlist_len(mtbfs))
+ stable_mtbf = *(double*)smartlist_get(mtbfs,
+ smartlist_len(mtbfs)/2);
+ enough_mtbf_info = rep_hist_have_measured_enough_stability();
+
if (smartlist_len(bandwidths)) {
fast_bandwidth = *(uint32_t*)smartlist_get(bandwidths,
smartlist_len(bandwidths)/8);
@@ -1640,9 +1677,11 @@ dirserv_compute_performance_thresholds(routerlist_t *rl)
(unsigned long)guard_bandwidth_excluding_exits);
SMARTLIST_FOREACH(uptimes, uint32_t *, up, tor_free(up));
+ SMARTLIST_FOREACH(mtbfs, double *, mtbf, tor_free(mtbf));
SMARTLIST_FOREACH(bandwidths, uint32_t *, bw, tor_free(bw));
SMARTLIST_FOREACH(bandwidths_excluding_exits, uint32_t *, bw, tor_free(bw));
smartlist_free(uptimes);
+ smartlist_free(mtbfs);
smartlist_free(bandwidths);
smartlist_free(bandwidths_excluding_exits);
}
diff --git a/src/or/or.h b/src/or/or.h
index 809d69695..409af8ade 100644
--- a/src/or/or.h
+++ b/src/or/or.h
@@ -3124,6 +3124,7 @@ int rep_hist_load_mtbf_data(time_t now);
time_t rep_hist_downrate_old_runs(time_t now);
double rep_hist_get_stability(const char *id, time_t when);
+int rep_hist_have_measured_enough_stability(void);
void rep_hist_note_used_port(uint16_t port, time_t now);
smartlist_t *rep_hist_get_predicted_ports(time_t now);
diff --git a/src/or/rephist.c b/src/or/rephist.c
index 0474ee644..254f5f3b2 100644
--- a/src/or/rephist.c
+++ b/src/or/rephist.c
@@ -20,9 +20,18 @@ static void hs_usage_init(void);
uint64_t rephist_total_alloc=0;
uint32_t rephist_total_num=0;
+/** If the total weighted run count of all runs for a router ever falls
+ * below this amount, the router can be treated as having 0 MTBF. */
#define STABILITY_EPSILON 0.0001
-#define STABILITY_ALPHA 0.9
+/** Value by which to discount all old intervals for MTBF purposes. This
+ * is compounded every STABILITY_INTERVAL. */
+#define STABILITY_ALPHA 0.95
+/** Interval at which to discount all old intervals for MTBF purposes. */
#define STABILITY_INTERVAL (12*60*60)
+/* (This combination of ALPHA, INTERVAL, and EPSILON makes it so that an
+ * interval that just ended counts twice as much as one that ended a week ago,
+ * 20X as much as one that ended a month ago, and routers that have had no
+ * uptime data for about half a year will get forgotten.) */
/** History of an OR-\>OR link. */
typedef struct link_history_t {
@@ -56,18 +65,30 @@ typedef struct or_history_t {
time_t up_since;
/** If nonzero, we have been unable to connect since this time. */
time_t down_since;
- /** DOCDOC */
+
+
+ /* === For MTBF tracking: */
+ /** Weighted sum total of all times that this router has been online.
+ */
unsigned long weighted_run_length;
+ /** If the router is now online (according to stability-checking rules),
+ * when did it come online? */
time_t start_of_run;
+ /** Sum of weights for runs in weighted_run_length. */
double total_run_weights;
+
/** Map from hex OR2 identity digest to a link_history_t for the link
* from this OR to OR2. */
digestmap_t *link_history_map;
} or_history_t;
-/** DOCDOC */
+/** When did we last multiply all routers' weighted_run_length and
+ * total_run_weights by STABILITY_ALPHA? */
static time_t stability_last_downrated = 0;
+/** When did we start tracking router stability? (0 if we haven't yet.) */
+static time_t started_tracking_stability = 0;
+
/** Map from hex OR identity digest to or_history_t. */
static digestmap_t *history_map = NULL;
@@ -163,7 +184,9 @@ rep_hist_init(void)
hs_usage_init();
}
-/** DOCDOC */
+/** Helper: note that we are no longer connected to the router with history
+ * <b>hist</b>. If <b>failed</b>, the connection failed; otherwise, it was
+ * closed correctly. */
static void
mark_or_down(or_history_t *hist, time_t when, int failed)
{
@@ -176,7 +199,8 @@ mark_or_down(or_history_t *hist, time_t when, int failed)
}
}
-/** DOCDOC */
+/** Helper: note that we are connected to the router with history
+ * <b>hist</b>. */
static void
mark_or_up(or_history_t *hist, time_t when)
{
@@ -259,6 +283,8 @@ void
rep_hist_note_router_reachable(const char *id, time_t when)
{
or_history_t *hist = get_or_history(id);
+ if (!started_tracking_stability)
+ started_tracking_stability = time(NULL);
if (hist && !hist->start_of_run) {
hist->start_of_run = when;
}
@@ -270,6 +296,8 @@ void
rep_hist_note_router_unreachable(const char *id, time_t when)
{
or_history_t *hist = get_or_history(id);
+ if (!started_tracking_stability)
+ started_tracking_stability = time(NULL);
if (hist && hist->start_of_run) {
/*XXXX020 treat failure specially? */
long run_length = when - hist->start_of_run;
@@ -279,7 +307,8 @@ rep_hist_note_router_unreachable(const char *id, time_t when)
}
}
-/**DOCDOC*/
+/** Helper: Discount all old MTBF data, if it is time to do so. Return
+ * the time at which we should next discount MTBF data. */
time_t
rep_hist_downrate_old_runs(time_t now)
{
@@ -296,11 +325,13 @@ rep_hist_downrate_old_runs(time_t now)
if (stability_last_downrated + STABILITY_INTERVAL > now)
return stability_last_downrated + STABILITY_INTERVAL;
+ /* Okay, we should downrate the data. By how much? */
while (stability_last_downrated + STABILITY_INTERVAL < now) {
stability_last_downrated += STABILITY_INTERVAL;
alpha *= STABILITY_ALPHA;
}
+ /* Multiply every w_r_l, t_r_w pair by alpha. */
for (orhist_it = digestmap_iter_init(history_map);
!digestmap_iter_done(orhist_it);
orhist_it = digestmap_iter_next(history_map,orhist_it)) {
@@ -315,7 +346,7 @@ rep_hist_downrate_old_runs(time_t now)
return stability_last_downrated + STABILITY_INTERVAL;
}
-/** DOCDOC */
+/** Helper: Return the weighted MTBF of the router with history <b>hist</b>. */
static double
get_stability(or_history_t *hist, time_t when)
{
@@ -323,16 +354,21 @@ get_stability(or_history_t *hist, time_t when)
double total_weights = hist->total_run_weights;
if (hist->start_of_run) {
+ /* We're currently in a run. Let total and total_weights hold the values
+ * they would hold if the current run were to end now. */
total += (when-hist->start_of_run);
total_weights += 1.0;
}
- if (total_weights < STABILITY_EPSILON)
+ if (total_weights < STABILITY_EPSILON) {
+ /* Round down to zero, and avoid divide-by-zero. */
return 0.0;
+ }
return total / total_weights;
}
-/**DOCDOC*/
+/** Return an estimated MTBF for the router whose identity digest is
+ * <b>id</b>. Return 0 if the router is unknown. */
double
rep_hist_get_stability(const char *id, time_t when)
{
@@ -343,6 +379,16 @@ rep_hist_get_stability(const char *id, time_t when)
return get_stability(hist, when);
}
+/** Return true if we've been measuring MTBFs for long enough (at least
+ * four days) to pronounce on Stability. */
+int
+rep_hist_have_measured_enough_stability(void)
+{
+ /* XXXX020 This doesn't do so well when we change our opinion
+ * as to whether we're tracking router stability. */
+ return started_tracking_stability < time(NULL) - 4*24*60*60;
+}
+
/** Remember that we successfully extended from the OR with identity
* digest <b>from_id</b> to the OR with identity digest
* <b>to_name</b>.
@@ -502,7 +548,8 @@ rep_history_clean(time_t before)
}
}
-/** DOCDOC */
+/** Return a newly allocated string holding the filename in which we store
+ * MTBF information. */
static char *
get_mtbf_filename(void)
{
@@ -513,7 +560,7 @@ get_mtbf_filename(void)
return fn;
}
-/** DOCDOC */
+/** Write MTBF data to disk. Returns 0 on success, negative on failure. */
int
rep_hist_record_mtbf_data(void)
{
@@ -526,6 +573,16 @@ rep_hist_record_mtbf_data(void)
void *or_history_p;
or_history_t *hist;
+ /* File format is:
+ * FormatLine *KeywordLine Data
+ *
+ * FormatLine = "format 1" NL
+ * KeywordLine = Keyword SP Arguments NL
+ * Data = "data" NL *RouterMTBFLine "." NL
+ * RouterMTBFLine = Fingerprint SP WeightedRunLen SP
+ * TotalRunWeights [SP S=StartRunTime] NL
+ */
+
lines = smartlist_create();
smartlist_add(lines, tor_strdup("format 1\n"));
@@ -534,6 +591,11 @@ rep_hist_record_mtbf_data(void)
tor_snprintf(buf, sizeof(buf), "stored-at %s\n", time_buf);
smartlist_add(lines, tor_strdup(buf));
+ if (started_tracking_stability) {
+ format_iso_time(time_buf, started_tracking_stability);
+ tor_snprintf(buf, sizeof(buf), "tracked-since %s\n", time_buf);
+ smartlist_add(lines, tor_strdup(buf));
+ }
if (stability_last_downrated) {
format_iso_time(time_buf, stability_last_downrated);
tor_snprintf(buf, sizeof(buf), "last-downrated %s\n", time_buf);
@@ -579,7 +641,8 @@ rep_hist_record_mtbf_data(void)
}
}
-/** DOCDOC */
+/** Load MTBF data from disk. Returns 0 on success or recoverable error, -1
+ * on failure. */
int
rep_hist_load_mtbf_data(time_t now)
{
@@ -587,7 +650,8 @@ rep_hist_load_mtbf_data(time_t now)
smartlist_t *lines;
const char *line = NULL;
int r=0, i;
- time_t last_downrated = 0, stored_at = 0;
+ time_t last_downrated = 0, stored_at = 0, tracked_since = 0;
+ time_t latest_possible_start = now;
{
char *filename = get_mtbf_filename();
@@ -618,9 +682,16 @@ rep_hist_load_mtbf_data(time_t now)
log_warn(LD_GENERAL,"Couldn't parse stored time in mtbf "
"history file.");
}
+ if (!strcmpstart(line, "tracked-since ")) {
+ if (parse_iso_time(line+strlen("tracked-since "), &tracked_since)<0)
+ log_warn(LD_GENERAL,"Couldn't parse started-tracking time in mtbf "
+ "history file.");
+ }
}
if (last_downrated > now)
last_downrated = now;
+ if (tracked_since > now)
+ tracked_since = now;
if (!stored_at) {
log_warn(LD_GENERAL, "No stored time recorded.");
@@ -635,7 +706,7 @@ rep_hist_load_mtbf_data(time_t now)
char hexbuf[HEX_DIGEST_LEN+1];
char timebuf[ISO_TIME_LEN+1];
time_t start_of_run = 0;
- unsigned long wrl;
+ long wrl;
double trw;
int n;
or_history_t *hist;
@@ -643,7 +714,7 @@ rep_hist_load_mtbf_data(time_t now)
if (!strcmp(line, "."))
break;
/* XXXX020 audit the heck out of my scanf usage. */
- n = sscanf(line, "%40s %lu %lf S=%10s %8s",
+ n = sscanf(line, "%40s %ld %lf S=%10s %8s",
hexbuf, &wrl, &trw, timebuf, timebuf+11);
if (n != 3 && n != 5) {
log_warn(LD_GENERAL, "Couldn't scan line %s", escaped(line));
@@ -668,6 +739,8 @@ rep_hist_load_mtbf_data(time_t now)
long run_length = stored_at - start_of_run;
hist->start_of_run = now - run_length;
}
+ if (hist->start_of_run < latest_possible_start + wrl)
+ latest_possible_start = hist->start_of_run - wrl;
hist->weighted_run_length = wrl;
hist->total_run_weights = trw;
@@ -675,7 +748,11 @@ rep_hist_load_mtbf_data(time_t now)
if (strcmp(line, "."))
log_warn(LD_GENERAL, "Truncated MTBF file.");
+ if (!tracked_since)
+ tracked_since = latest_possible_start;
+
stability_last_downrated = last_downrated;
+ started_tracking_stability = tracked_since;
goto done;
err: