Summary:
It wasn't that hard after all: have a look at:
http://hk.danst0.com
All entries seem to be working fine. Comments, Hierarchies, Dates, all imported.
The RegExp idea helped a lot.
I wrote a Python script which analyzes the entries and outputs plain SQL commands, which are then imported into the database. The code is really crude (and I mean it), but maybe it helps someone who, like me, was careless enough to only back up to plain HTML...

Here it is:
Code: Select all
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""Convert a plain-HTML Serendipity blog backup into plain SQL INSERT statements.

Parses the exported *.html pages found under `path` and writes SQL for the
serendipity_entries / serendipity_entrycat / serendipity_comments tables.
"""
import os
import glob
import re
import time

# Directory containing the exported *.html pages to parse.
path = "blog.danst0.com/"
def replace_month(string):
    """Return *string* with every German month name replaced by its number.

    'Januar' becomes '1', 'Februar' becomes '2', ..., 'Dezember' becomes '12'.
    Used to turn the exported date headline into something strptime can parse.
    """
    month_numbers = (
        (u'Januar', "1"), (u'Februar', "2"), (u'März', "3"),
        (u'April', "4"), (u'Mai', "5"), (u'Juni', "6"),
        (u'Juli', "7"), (u'August', "8"), (u'September', "9"),
        (u'Oktober', "10"), (u'November', "11"), (u'Dezember', "12"),
    )
    for name, number in month_numbers:
        string = string.replace(name, number)
    return string
def r4body(string):
    """Prepare *string* for embedding in a single-quoted SQL string literal.

    Newline and carriage-return characters become the two-character escape
    sequences '\\n' / '\\r', and single quotes are doubled (SQL quoting).
    """
    escapes = (
        ('\n', '\\n'),
        ('\r', '\\r'),
        ("'", "''"),
    )
    for old, new in escapes:
        string = string.replace(old, new)
    return string
def replace_category_by_id(string):
    """Return *string* with known category names replaced by their numeric id.

    The ids correspond to the serendipity category table of the target blog.
    Fix over the original: the duplicated 'Arbeit & Co.' replacement (a no-op
    second call) has been removed; behavior is unchanged.

    NOTE(review): 'iPod' and 'Persönliches' both map to "6" — looks like a
    copy-paste leftover; confirm against the target category table.
    """
    categories = (
        (u'Allgemeines', "1"),
        (u'Arbeit & Co.', "2"),
        (u'Asien 2005', "3"),
        (u'Hong Kong 2004', "4"),
        (u'Mikro', "5"),
        (u'Persönliches', "6"),
        (u'iPod', "6"),
    )
    for name, cat_id in categories:
        string = string.replace(name, cat_id)
    return string
if __name__ == "__main__":
    # Parse every exported Serendipity HTML page under `path`, collect the
    # articles and their comments into parallel lists, then emit plain SQL
    # INSERT statements for the serendipity_entries / serendipity_entrycat /
    # serendipity_comments tables.
    # NOTE: this is Python 2 code (print statements, str.decode).
    print "Starting Parsing"
    article_count = 0
    # Parallel lists: index i describes article i (a_* = article data).
    a_id = []
    a_title = []
    a_timestamp = []
    a_body = []
    a_extended = []
    a_category = []
    # Parallel lists: index i describes comment i (c_* = comment data).
    c_id = []
    c_entryid = []
    c_parentid = []
    c_timestamp = []
    c_author = []
    c_body = []
    c_url = []
    c_email = []
    for infile in glob.glob( os.path.join(path, '*.html') ):
        # NOTE(review): the input file handle is never closed; harmless for a
        # one-shot script but worth knowing.
        f = open(infile, 'r')
        # Exported pages are Latin-1 encoded; decode to unicode up front.
        html_code = f.read().decode('ISO-8859-1')
        # Only single-article pages carry the comments-title div; index and
        # archive pages fail this test and are skipped.
        if (html_code.find("""<div class="serendipity_commentsTitle">""") != -1):
            m = re.search('<h4 class=\"serendipity_title\"><a.+>(.+)</a>', html_code)
            tmp_title = m.group(1)
            # The same entry can appear in several exported pages; only the
            # first occurrence of a title is imported.
            if tmp_title not in a_title:
                article_count += 1
                #print article_count, " Reading file: " + infile
                a_title.append(tmp_title)
                a_id.append(article_count)
                # Date headline, e.g. "Montag, 12. Dezember 2004".
                m = re.search('<h3 class=\"serendipity_date\">.+, (.+)</h3>', html_code)
                tmp_date = replace_month(m.group(1))
                # Time of day, e.g. "20:15", taken from the first matching link.
                m = re.search('<a.+>([0-9]{1,2}:[0-9]{1,2})</a>', html_code)
                tmp_time = m.group(1)
                #print tmp_title, tmp_date, tmp_time
                # Combine date and time into a unix timestamp; after
                # replace_month the date looks like "12. 12 2004".
                a_timestamp.append(int(time.mktime(time.strptime(tmp_date + " " + tmp_time, "%d. %m %Y %H:%M"))))
                # Search for Extended ("read more") section; optional per entry.
                m = re.search('<div class=\"serendipity_entry_extended\"><a id=\"extended\"></a>(.+)</div>.+<div class=.serendipity_entryFooter.>', html_code, re.DOTALL)
                extended = True
                text = ''
                try:
                    text = r4body(m.group(1).strip())
                except:
                    # No match -> m is None and .group() raises; treat that as
                    # "this entry has no extended body".
                    extended = False
                a_extended.append(text)
                ## Search Category
                m = re.search('in <a href=\".+?\">(.+?)</a> ', html_code)
                tmp = replace_month(m.group(1))
                tmp = replace_category_by_id(tmp)
                a_category.append(tmp)
                ## Search for Entry Body. The div that terminates the body
                ## differs depending on whether an extended section follows.
                if not extended:
                    sstring = '<div class=\"serendipity_entry_body\">(.+)</div>.+<div class=.serendipity_entryFooter.>'
                else:
                    sstring = '<div class=\"serendipity_entry_body\">(.+)</div>.+<div class=\"serendipity_entry_extended\">'
                m = re.search(sstring , html_code, re.DOTALL)
                text = r4body(m.group(1).strip())
                a_body.append(text)
                if (html_code.find("Keine Trackbacks") == -1):
                    # Handle trackbacks — only reported, never imported
                    # (see the author's note: only 2 existed).
                    print "Trackback gefunden:", a_id[-1], a_title[-1], infile
                if (html_code.find("Noch keine Kommentare") == -1):
                    # Handle comments
                    #print "Kommentare gefunden"
                    # Every comment has an anchor of the form <a id="c<nr>"></a>.
                    lm = re.findall('<a id=\"c([0-9]{1,3})\"></a>', html_code)
                    # Maps hierarchy label ("1", "1.1", ...) -> comment id;
                    # used to resolve the parent of nested comments.
                    c_localid = {}
                    for m in lm:
                        # Entry ID
                        c_entryid.append(a_id[-1])
                        # Comment ID
                        tmp = int(m)
                        c_id.append(tmp)
                        # Hierarchy
                        j = re.search('<a href=\"#c' + m + '\" title="Link to comment #[0-9\.]+">#([0-9\.]+)</a>', html_code)
                        hi = j.group(1)
                        # Search Parent ID
                        c_localid[hi] = tmp
                        tmp_parent = 0
                        if "." in hi:
                            # "1.2" -> parent label "1" (drops the last two
                            # characters). NOTE(review): assumes single-digit
                            # suffixes; "1.10" would resolve wrongly — confirm
                            # no entry has 10+ replies.
                            tmp_parent = c_localid[hi[0:-2]]
                        c_parentid.append(tmp_parent)
                        # Comment Author, URL and Date; the (url) group is
                        # optional because not every commenter left a homepage.
                        sstring = '#' + hi + '</a>(?P<name>.+?)(\(<a href=\"(?P<url>.+?)\".+?\).+?)?am.+?(?P<datum>[0-9]{1,2}\.[0-9]{1,2}\.[0-9]{4} [0-9]{1,2}:[0-9]{1,2}).+\(<a href=\"#serendipity_CommentForm\"'
                        j = re.search(sstring, html_code, re.DOTALL)
                        c_author.append(r4body(j.group('name').strip().strip('\r').strip('\n')))
                        tmp_url = ''
                        try:
                            tmp_url = j.group('url').strip()
                        except:
                            # Optional url group did not match -> no homepage.
                            pass
                        c_url.append(tmp_url)
                        # German date format "dd.mm.yyyy hh:mm" -> unix time.
                        c_timestamp.append(int(time.mktime(time.strptime(j.group('datum').strip(), "%d.%m.%Y %H:%M"))))
                        # Find comment body:
                        sstring = '<a id=\"c' + m + '\"></a>.+?<div class=\"serendipity_commentBody\">(.+?)</div>'
                        j = re.search(sstring, html_code, re.DOTALL)
                        c_body.append(r4body(j.group(1)))
                        #break
            #break
    #print a_title
    # --- Emit SQL. NOTE(review): hard-coded absolute output path. ---
    f = open('/Users/danst/Desktop/articles.sql', 'w')
    table_name = '`serendipity_entries`'
    f.write("TRUNCATE TABLE "+ table_name + ";\r\n")
    for i, id in enumerate(a_id):
        # Columns id title timestamp body comments trackbacks extended exflag author authorid isdraft allow_comments last_modified moderate_comments
        if a_extended[i] == "":
            exflag = '0'
        else:
            exflag = '1'
        ## Count Comments (linear scan per article; fine at this scale)
        comment_count = 0
        for z in c_entryid:
            if z == id:
                comment_count += 1
        sql = "INSERT INTO " + table_name + " VALUES (" + str(id) + ", " + \
            "'" + a_title[i] + "', " + \
            str(a_timestamp[i]) + ", " + \
            "'" + a_body[i] + "', " + \
            str(comment_count) + ", " + \
            str(0) + ", " + \
            "'" + a_extended[i] + "', " + \
            exflag + ", " + \
            "'admin'" + ", " + \
            "'1'" + ", " + \
            "'false'" + ", " + \
            "'false'" + ", " + \
            "1304420225" + ", " + \
            "'false'" + ");"
        #print sql
        f.write(sql.encode('utf-8') + '\r\n\r\n')
    # Article -> category mapping.
    table_name = '`serendipity_entrycat`'
    f.write("TRUNCATE TABLE "+ table_name + ";\r\n")
    for i, id in enumerate(a_id):
        sql = "INSERT INTO " + table_name + " VALUES (" + str(id) + ", " + \
            str(a_category[i]) + ");"
        f.write(sql.encode('utf-8') + '\r\n\r\n')
    # Comments table.
    table_name = '`serendipity_comments`'
    f.write("TRUNCATE TABLE "+ table_name + ";\r\n")
    for i, id in enumerate(c_id):
        #print i, c_entryid[i], c_parentid[i],
        #print c_timestamp[i], c_author[i], c_url[i], c_body[i]
        # id entry_id parent_id timestamp title author email url ip body type subscribed status referer
        #
        #
        sql = "INSERT INTO " + table_name + " VALUES ('" + str(id) + "', " + \
            "'" + str(c_entryid[i]) + "', " + \
            "'" + str(c_parentid[i]) + "', " + \
            "'" + str(c_timestamp[i]) + "', " + \
            "'', " + \
            "'" + c_author[i] + "', " + \
            "'', " + \
            "'" + c_url[i] + "', " + \
            "'', " + \
            "'" + c_body[i] + "', " + \
            "'NORMAL', " + \
            "'false'" + ", " + \
            "'approved'" + ", " + \
            "'');"
        #print sql
        f.write(sql.encode('utf-8') + '\r\n\r\n')
    f.close()
After importing the only thing I did was to copy the pictures to the uploads folder manually, serendipity automatically added the pictures to the library.
I did not import trackbacks since I only had 2.
Daniel
PS: Overall the programming / import took about 4 hours, and it was of course much more rewarding than playing copy-and-paste monkey.