1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
|
[[!meta title="ikiwiki-wordpress-import"]]
I modified the script a bit so categories and tags would actually show up in the output file.
<pre>
#!/usr/bin/env python
"""
Purpose:
Wordpress-to-Ikiwiki import tool
Copyright:
Copyright (C) 2007 Chris Lamb <chris@chris-lamb.co.uk>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Usage: run this script with --help as an argument.
Notes:
I added some extra bits to include the [[!tag foo]] directives in each post,
since the original script did not emit them at all. I'll diff the versions so
you can see the mess I made :).
"""
import os, sys
import time
import re
from BeautifulSoup import BeautifulSoup
import codecs, htmlentitydefs
def _html_replace(exc):
    """Codec error handler: replace unencodable characters with HTML entities.

    Each character in the failing span ``exc.object[exc.start:exc.end]`` is
    turned into a named entity (``&eacute;``) when
    ``htmlentitydefs.codepoint2name`` knows one, and into a numeric character
    reference (``&#257;``) otherwise.  The numeric fallback matters: the
    name table has no entry for most code points, and a plain dictionary
    lookup would raise KeyError for them and abort the whole encode.

    Returns the ``(replacement, resume_position)`` tuple that the codecs
    error-handler protocol expects.
    """
    pieces = []
    for c in exc.object[exc.start:exc.end]:
        codepoint = ord(c)
        entity = htmlentitydefs.codepoint2name.get(codepoint)
        if entity is not None:
            pieces.append(u'&%s;' % entity)
        else:
            # No named entity for this code point: use a numeric reference.
            pieces.append(u'&#%d;' % codepoint)
    return (u''.join(pieces), exc.end)


# Make the handler available as errors='html_replace' in str.encode().
codecs.register_error('html_replace', _html_replace)
def main(name, email, subdir, branch='master'):
soup = BeautifulSoup(sys.stdin.read())
# Regular expression to match stub in URL.
stub_pattern = re.compile(r'.*\/(.+)\/$')
for x in soup.findAll('item'):
# Ignore draft posts
if x.find('wp:status').string != 'publish': continue
match = stub_pattern.match(x.guid.string)
if match:
stub = match.groups()[0]
else:
# Fall back to our own stubs
stub = re.sub(r'[^a-zA-Z0-9_]', '-', x.title.string).lower()
commit_msg = """Importing WordPress post "%s" [%s]""" % (x.title.string, x.guid.string)
timestamp = time.mktime(time.strptime(x.find('wp:post_date_gmt').string, "%Y-%m-%d %H:%M:%S"))
content = '[[!meta title="%s"]]\n\n' % (x.title.string.replace('"', r'\"'))
content += x.find('content:encoded').string.replace('\r\n', '\n')
# categories = x.findAll('category')
# categories = x.findAll({'category':True}, attrs={'domain':re.compile(('category|tag'))})
# categories = x.findAll({'category':True}, domain=["category", "tag"])
# categories = x.findAll({'category':True}, nicename=True)
"""
We do it differently here because we have duplicates otherwise.
Take a look:
<category><![CDATA[Health]]></category>
<category domain="category" nicename="health"><![CDATA[Health]]></category>
If we do the what original did, we end up with all tags and cats doubled.
Therefore we only pick out nicename="foo". Our 'True' below is our 'foo'.
I'd much rather have the value of 'nicename', and tried, but my
python skillz are extremely limited....
"""
categories = x.findAll('category', nicename=True)
if categories:
content += "\n"
for cat in categories:
# remove 'tags/' because we have a 'tagbase' set.
# your choice: 'tag', or 'taglink'
# content += "\n[[!tag %s]]" % (cat.string.replace(' ', '-'))
content += "\n[[!taglink %s]]" % (cat.string.replace(' ', '-'))
# print >>sys.stderr, cat.string.replace(' ', '-')
# moved this thing down
data = content.encode('ascii', 'html_replace')
print "commit refs/heads/%s" % branch
print "committer %s <%s> %d +0000" % (name, email, timestamp)
print "data %d" % len(commit_msg)
print commit_msg
print "M 644 inline %s" % os.path.join(subdir, "%s.mdwn" % stub)
print "data %d" % len(data)
print data
if __name__ == "__main__":
    # Script entry point: expects "name email subdir" plus an optional
    # branch, i.e. three or four arguments after the script name.
    if 4 <= len(sys.argv) <= 5:
        main(*sys.argv[1:])
    else:
        # Wrong argument count: show the usage line on stderr.
        script = sys.argv[0]
        usage = "%s: usage: %s name email subdir [branch] < wordpress-export.xml | git-fast-import " % (script, script)
        sys.stderr.write(usage + "\n")
</pre>
|