aboutsummaryrefslogtreecommitdiff
path: root/doc/tips/importing_posts_from_wordpress
diff options
context:
space:
mode:
authorsimonraven <simonraven@web>2009-04-28 17:46:21 -0400
committerJoey Hess <joey@kitenet.net>2009-04-28 17:46:21 -0400
commitd6b095b06cf94463bc505d728e8281a6b5e03a03 (patch)
tree6790a1e872da266ea6ff7f9a90bafe84f1bb0d8d /doc/tips/importing_posts_from_wordpress
parent0e8f7c336dc3ecd2367cbe8dd6c091d1989b2491 (diff)
downloadikiwiki-d6b095b06cf94463bc505d728e8281a6b5e03a03.tar
ikiwiki-d6b095b06cf94463bc505d728e8281a6b5e03a03.tar.gz
Diffstat (limited to 'doc/tips/importing_posts_from_wordpress')
-rw-r--r--doc/tips/importing_posts_from_wordpress/ikiwiki-wordpress-import.mdwn114
1 files changed, 111 insertions, 3 deletions
diff --git a/doc/tips/importing_posts_from_wordpress/ikiwiki-wordpress-import.mdwn b/doc/tips/importing_posts_from_wordpress/ikiwiki-wordpress-import.mdwn
index aa1135b5d..0c0527f2c 100644
--- a/doc/tips/importing_posts_from_wordpress/ikiwiki-wordpress-import.mdwn
+++ b/doc/tips/importing_posts_from_wordpress/ikiwiki-wordpress-import.mdwn
@@ -113,12 +113,120 @@ if __name__ == "__main__":
</pre>
-----
-(I don't know why the tag down there says %s other than it's reading in from the &lt;pre&gt;'d script...)
-
-I have another version of the script, which scans for &lt;pubDate&gt; and inserts that as a \[[!meta date="foodate"]]. Still work in progress since I just got back to it today, after adding in a rough idea. I'll post a link to it here when it's "good enough for me".
+I have another version of the script, which uses the `timestamp` from the script, and inserts that as a \[[!meta date="foodate"]]. I'm posting it here just in case I happen to be doing something to the httpd.
(Hopefully I've escaped everything properly; if I missed something, check the source.)
+-----
+<pre>
+#!/usr/bin/env python
+
+"""
+ Purpose:
+ Wordpress-to-Ikiwiki import tool
+
+ Copyright:
+ Copyright (C) 2007 Chris Lamb <chris@chris-lamb.co.uk>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ Usage: run --help as an argument with this script.
+
+ Notes:
+ I added some extra bits to include the \[[!tag foo]] stuff in the post,
+ as it wasn't before, at all. I'll diff the versions out so you can see
+ the mess I made :).
+
+"""
+
+import os, sys
+import time
+import re
+
+from datetime import datetime
+from BeautifulSoup import BeautifulSoup
+
+import codecs, htmlentitydefs
+
+codecs.register_error('html_replace', lambda x: (''.join([u'&%s;' \
+ % htmlentitydefs.codepoint2name[ord(c)] for c in x.object[x.start:x.end]]), x.end))
+
+def main(name, email, subdir, branch='master'):
+ soup = BeautifulSoup(sys.stdin.read())
+
+ # Regular expression to match stub in URL.
+ stub_pattern = re.compile(r'.*\/(.+)\/$')
+
+ for x in soup.findAll('item'):
+ # Ignore draft posts
+ if x.find('wp:status').string != 'publish': continue
+
+ match = stub_pattern.match(x.guid.string)
+ if match:
+ stub = match.groups()[0]
+ else:
+ # Fall back to our own stubs
+ stub = re.sub(r'[^a-zA-Z0-9_]', '-', x.title.string).lower()
+
+ commit_msg = """Importing WordPress post "%s" [%s]""" % (x.title.string, x.guid.string)
+ timestamp = time.mktime(time.strptime(x.find('wp:post_date_gmt').string, "%Y-%m-%d %H:%M:%S"))
+ content = '\[[!meta title="%s"]]\n' % (x.title.string.replace('"', r'\"'))
+ content += "\[[!meta date=\"%s\"]]\n" % datetime.fromtimestamp(timestamp)
+ content += x.find('content:encoded').string.replace('\r\n', '\n')
+
+ """
+ We do it differently here because we have duplicates otherwise.
+ Take a look:
+ &lt;category&gt;&lt;![CDATA[Health]]&gt;&lt;/category&gt;
+ &lt;category domain="category" nicename="health"&gt;&lt;![CDATA[Health]]&gt;&lt;/category&gt;
+
+ If we do the what original did, we end up with all tags and cats doubled.
+ Therefore we only pick out nicename="foo". Our 'True' below is our 'foo'.
+ I'd much rather have the value of 'nicename', and tried, but my
+ python skillz are extremely limited....
+ """
+ categories = x.findAll('category', nicename=True)
+ if categories:
+ content += "\n"
+ for cat in categories:
+ # remove 'tags/' because we have a 'tagbase' set.
+ # your choice: 'tag', or 'taglink'
+ # content += "\n\[[!tag %s]]" % (cat.string.replace(' ', '-'))
+ content += "\n\[[!taglink %s]]" % (cat.string.replace(' ', '-'))
+ # this is just debugging, and for fun
+ # print >>sys.stderr, cat.string.replace(' ', '-')
+
+ # moved this thing down
+ data = content.encode('ascii', 'html_replace')
+ print "commit refs/heads/%s" % branch
+ print "committer %s &lt;%s&gt; %d +0000" % (name, email, timestamp)
+ print "data %d" % len(commit_msg)
+ print commit_msg
+ print "M 644 inline %s" % os.path.join(subdir, "%s.mdwn" % stub)
+ print "data %d" % len(data)
+ print data
+
+if __name__ == "__main__":
+ if len(sys.argv) not in (4, 5):
+ print >>sys.stderr, "%s: usage: %s name email subdir [branch] < wordpress-export.xml | git-fast-import " % (sys.argv[0], sys.argv[0])
+ else:
+ main(*sys.argv[1:])
+
+</pre>
+-----
+
+
[[!tag wordpress]]
[[!tag python]]
[[!tag conversion]]