Tweak the changelog formatter a little.

(I had a bad clone of Knuth's algorithm sitting around in an old code repository of mine. I added orphan detection and smarter hyphenation; it seems to give marginally better results than we had before.)
author: Nick Mathewson <nickm@torproject.org> 2014-05-02 12:50:23 -0400
committer: Nick Mathewson <nickm@torproject.org> 2014-05-02 12:50:23 -0400
commit: 8a173635bcd85ad7375d996fb39efeaddd8c611d (patch)
tree: 995d20f608880f610a167c182118129d9c1d21b4
parent: 29b7397ebe0d2888c9c27f51d3bcdf0d347ee463 (diff)
download: tor-8a173635bcd85ad7375d996fb39efeaddd8c611d.tar
tor-8a173635bcd85ad7375d996fb39efeaddd8c611d.tar.gz
1 files changed, 134 insertions, 5 deletions
diff --git a/scripts/maint/format_changelog.py b/scripts/maint/format_changelog.py
index 6997d958a..35044b318 100755
--- a/scripts/maint/format_changelog.py
+++ b/scripts/maint/format_changelog.py
@@ -12,7 +12,135 @@
 import os
 import re
 import sys
-import textwrap
+
+# ==============================
+# Oh, look!  It's a cruddy approximation to Knuth's elegant text wrapping
+# algorithm, with totally ad hoc parameters!
+#
+# We're trying to minimize:
+#    The total of the cubes of ragged space on underflowed intermediate lines,
+#  PLUS
+#    100 * the fourth power of overflowed characters
+#  PLUS
+#    .1 * a bit more than the cube of ragged space on the last line.
+#
+# We use an obvious dynamic programming algorithm to sorta approximate this.
+# It's not coded right or optimally, but it's fast enough for changelogs
+#
+# (Code found in an old directory of mine, lightly cleaned. -NM)
+
+NO_HYPHENATE=set("""
+pf-divert
+""".split())
+
+LASTLINE_UNDERFLOW_EXPONENT = 1
+LASTLINE_UNDERFLOW_PENALTY = 1
+
+UNDERFLOW_EXPONENT = 3
+UNDERFLOW_PENALTY = 1
+
+OVERFLOW_EXPONENT = 4
+OVERFLOW_PENALTY = 2000
+
+ORPHAN_PENALTY = 10000
+
+def generate_wrapping(words, divisions):
+    lines = []
+    last = 0
+    for i in divisions:
+        w = words[last:i]
+        last = i
+        line = " ".join(w).replace("\xff ","-").replace("\xff","-")
+        lines.append(line)
+    return lines
+
+def wrapping_quality(words, divisions, width1, width2):
+    total = 0.0
+
+    lines = generate_wrapping(words, divisions)
+    for line in lines:
+        length = len(line)
+        if line is lines[0]:
+            width = width1
+        else:
+            width = width2
+
+        if length > width:
+            total += OVERFLOW_PENALTY * (
+                (length - width) ** OVERFLOW_EXPONENT )
+        else:
+            if line is lines[-1]:
+                e,p = (LASTLINE_UNDERFLOW_EXPONENT, LASTLINE_UNDERFLOW_PENALTY)
+                if " " not in line:
+                    total += ORPHAN_PENALTY
+            else:
+                e,p = (UNDERFLOW_EXPONENT, UNDERFLOW_PENALTY)
+
+            total += p * ((width - length) ** e)
+
+    return total
+
+def wrap_graf(words, prefix_len1=0, prefix_len2=0, width=72):
+    wrapping_after = [ (0,), ]
+
+    w1 = width - prefix_len1
+    w2 = width - prefix_len2
+
+    for i in range(1, len(words)+1):
+        best_so_far = None
+        best_score = 1e300
+        for j in range(i):
+            t = wrapping_after[j]
+            t1 = t[:-1] + (i,)
+            t2 = t + (i,)
+            wq1 = wrapping_quality(words, t1, w1, w2)
+            wq2 = wrapping_quality(words, t2, w1, w2)
+
+            if wq1 < best_score:
+                best_so_far = t1
+                best_score = wq1
+            if wq2 < best_score:
+                best_so_far = t2
+                best_score = wq2
+        wrapping_after.append( best_so_far )
+
+    lines = generate_wrapping(words, wrapping_after[-1])
+
+    return lines
+
+def hyphenateable(word):
+    if re.match(r'^[^\d\-].*-', word):
+        stripped = re.sub(r'^\W+','',word)
+        stripped = re.sub(r'\W+$','',word)
+        return stripped not in NO_HYPHENATE
+    else:
+        return False
+
+def split_paragraph(s):
+    "Split paragraph into words; tuned for Tor."
+
+    r = []
+    for word in s.split():
+        if hyphenateable(word):
+            while "-" in word:
+                a,word = word.split("-",1)
+                r.append(a+"\xff")
+        r.append(word)
+    return r
+
+def fill(text, width, initial_indent, subsequent_indent):
+    words = split_paragraph(text)
+    lines = wrap_graf(words, len(initial_indent), len(subsequent_indent),
+                      width)
+    res = [ initial_indent, lines[0], "\n" ]
+    for line in lines[1:]:
+        res.append(subsequent_indent)
+        res.append(line)
+        res.append("\n")
+    return "".join(res)
+
+# ==============================
+
 
 TP_MAINHEAD = 0
 TP_HEADTEXT = 1
@@ -108,10 +236,11 @@ class ChangeLog(object):
         if indent2 == -1:
             indent2 = indent1
         text = " ".join(re.sub(r'\s+', ' ', line.strip()) for line in par)
-        print textwrap.fill(text, width=72,
-                            initial_indent=" "*indent1,
-                            subsequent_indent=" "*indent2,
-                            break_on_hyphens=False)
+
+        sys.stdout.write(fill(text,
+                              width=72,
+                              initial_indent=" "*indent1,
+                              subsequent_indent=" "*indent2))
 
     def dump(self):
         print self.mainhead
author	Nick Mathewson <nickm@torproject.org>	2014-05-02 12:50:23 -0400
committer	Nick Mathewson <nickm@torproject.org>	2014-05-02 12:50:23 -0400
commit	8a173635bcd85ad7375d996fb39efeaddd8c611d (patch)
tree	995d20f608880f610a167c182118129d9c1d21b4
parent	29b7397ebe0d2888c9c27f51d3bcdf0d347ee463 (diff)
download	tor-8a173635bcd85ad7375d996fb39efeaddd8c611d.tar tor-8a173635bcd85ad7375d996fb39efeaddd8c611d.tar.gz