<?php
/*
RSS import for WordPress
by Andrew Shearer (awshearer@shearersoftware.com)

For current version and more info, see:
http://www.shearersoftware.com/software/web-tools/wordpress-rss-import/

This script is currently intended to be run from the command line or from the
web after it has been configured by editing variables in the first few lines
of the script.

To use it, first set the $path variable below to a path to an RSS file or
directory containing a blogBrowser archive (one folder per year, one RSS file
per month.) Then run this script from the command line (php import-rss.php).

Examples:

Import an rss.xml file in this directory:
$path = dirname(__FILE__).'/rss.xml';

Import an rss.xml file with a full path specified (Mac OS X full path):
$path = '/Users/testuser/Sites/mysite/rss.xml';

Import a blogBrowser archive in a folder named C:/documents/weblog,
including monthly RSS files such as weblog/2003/12.xml (Windows full path):
$path = 'C:/documents/weblog';

Future improvements: make this runnable from a web browser. Single RSS files
could be handled through uploads, and multi-file blogBrowser archives could be
specified by base URL.

Licensed under the GNU General Public License (GPL).

Revision history:

2004-05-28  ashearer  Timezone support: preserve the author's local timezone;
                      parse timezone offsets for all date fields,
                      falling back to timezone offsets in other fields
                      or a default timezone if not available; use GMT
                      for modification date comparisons.
                      No use of addslashes()--use mysql_escape_string()
                      where appropriate.
2004-05-27  ashearer  Updated for WordPress 1.2: removed $kCreateModDateField
                      option, use post_date_gmt and post_modified_gmt
2003-12-26  ashearer  Improved date conflict resolution with $kUpdatePostsAlways
                      and $kUpdatePostsIfNewer options; added $kTakeNoAction;
                      added more comments
2003-12-22  ashearer  Added blogBrowser archive support; optional mod. dates;
                      mod. date column autocreation
2003-12-21  ashearer  RSS import, initial version


*/

// Location of the feed to import. The dispatch code at the bottom of this file
// treats a value that ends in '/' or names a directory as a blogBrowser archive;
// anything else is imported as a single RSS file. Alternative examples:
//$path = dirname(__FILE__).'/../archivedir';
//$path = dirname(__FILE__).'/rss.xml';
$path = 'http://www.example.com/rss.xml';

// Conflict-resolution and dry-run switches read by processPost().
$kSetModDateField = true;       // import post_modified field from RSS file?
$kUpdatePostsAlways = false;    // true to import RSS version even if it replaces current version
$kUpdatePostsIfNewer = true;    // if true, in case of conflict, use newer version; requires post_modified field
$kTakeNoAction = false;          // like -n flag; report actions but don't actually change DB

// Offset from UTC, in minutes (west is negative). Used when we receive dates in GMT only,
// to work back to local time, or dates without timezones, to work forward to GMT.
// The calculation below gets the host computer's timezone (as of 2001-01-01, so it
// does not reflect daylight saving time); to put in a fixed timezone,
// adjust and uncomment the line after it.
$kDefaultTimezoneOffset = (strtotime('2001-01-01 00:00:00 GMT') - strtotime('2001-01-01 00:00:00')) / 60;
// $kDefaultTimezoneOffset = 0;

// Categories to leave out of the import, keyed by category name (lookups are by key).
$kExcludeCategories = array('Testing' => '');

// Report every PHP error, warning and notice while the import runs.
error_reporting(E_ALL);

/*
One feed item, filled in by the XML handlers and then handed to processPost().

The date fields hold Unix timestamps as produced by the parseDate*TZ()
functions; each matching *TZ field holds that date's offset from GMT in
minutes (west is negative), or null when no offset could be determined.

Properties keep the PHP 4 style "var" declarations used throughout this file.
*/
class post
{
    var $title;                 // item title text
    var $content;               // item body (content:encoded, else description)
    var $createDate;            // creation timestamp (pubDate or dc:date)
    var $createDateTZ;          // GMT offset of $createDate in minutes, or null
    var $modDate;               // modification timestamp (dcterms:modified), if any
    var $modDateTZ;             // GMT offset of $modDate in minutes, or null
    var $guid;                  // text of the <guid> element
    var $permalink;             // guid text when the feed marks it isPermaLink="true"
    var $categories = array();  // list of category names
    var $postTitle;             // not referenced elsewhere in this file; kept for compatibility
}

// Parser state shared by the XML handler callbacks (they reach it through "global").
$currentPost = null;    // post object for the <item> being read; reset to null after each item is processed
$currentText = '';      // character data accumulated for the element currently being read

/*
Convert an ISO 8601 / W3CDTF datetime string to SQL text form.

Takes the first ten characters as the date and the eight characters after
the separator as the time, joining them with a space. Any timezone suffix
is ignored, and no validation is performed.
*/
function parseDateISO8601($input)
{
    $datePart = substr($input, 0, 10);
    $timePart = substr($input, 11, 8);

    return sprintf('%s %s', $datePart, $timePart);
}

/*
Parse an ISO 8601 / W3CDTF datetime such as "2004-05-28T12:34:56-04:00".

Returns a two-element array.
Element 1: the datetime as written in the input (its timezone suffix is
    stripped before calling strtotime), in seconds since the epoch.
Element 2: the timezone offset from GMT in minutes, or null if unknown.
    West is negative.

Accepted forms: seconds are optional, fractional seconds are ignored, and
the zone may be "Z", "+hh:mm" or "+hhmm". Anything else is treated as
malformed and handed to strtotime() unchanged, with a null offset.
*/
function parseDateISO8601TZ($input)
{
    $input = trim($input);
    $parts = array();
    $pattern = '/^(\d{4}-\d{2}-\d{2})T(\d{2}:\d{2}(?::\d{2})?)(?:\.\d+)?\s*(Z|[+-]\d{2}:?\d{2})?$/';

    if (!preg_match($pattern, $input, $parts)) {
        // malformed W3CDTF; punt to strtotime
        return array(strtotime($input), null);
    }

    // Parse date and time without the zone so the result stays in "input time".
    $datetime = strtotime($parts[1] . ' ' . $parts[2]);

    if (!isset($parts[3]) || $parts[3] === '') {
        $timezoneOffset = null;         // no zone designator present
    }
    elseif ($parts[3] == 'Z') {
        $timezoneOffset = 0;
    }
    else {
        // The sign applies to hours and minutes alike, so "-05:30" is -330, not -270.
        $digits = str_replace(':', '', substr($parts[3], 1));
        $minutes = intval(substr($digits, 0, 2), 10) * 60 + intval(substr($digits, 2, 2), 10);
        $timezoneOffset = ($parts[3][0] == '-') ? -$minutes : $minutes;
    }

    return array($datetime, $timezoneOffset);
}

/*
Convert an RFC 822 date string to SQL (MySQL, at least)-compatible text.

The string is parsed with strtotime(), so the result is expressed in the
computer's local timezone. date() replaces strftime(), which is deprecated
as of PHP 8.1; the output format is unchanged.
*/
function parseDateRFC822($input)
{
    return date('Y-m-d H:i:s', strtotime($input));
}

/*
Parse an RFC 822 date such as "Fri, 28 May 2004 12:34:56 -0400".

Returns a two-element array.
1st element: the datetime in the original local timezone,
    in seconds since the epoch.
2nd element: the timezone offset from GMT in minutes,
    or null if unknown. West is negative.

There are 3 time zones involved:

"Computer time": as translated to the computer's local timezone.
"Input time": as written in the input string.
GMT: as translated to UTC.

We want to get both "input time" and the input timezone's offset
from GMT. strtotime, however, returns "computer time", which we're
not interested in. (We may be running on a server where the local
timezone setting is irrelevant. "Input time" preserves the
author's time zone.)

So we strip off the timezone information when parsing the datetime
itself, disabling strtotime's timezone correction and ensuring that
the result stays in "input time".

If the timezone offset is readily available in the string (as either
a numeric offset or "GMT", rather than some other text
abbreviation), we use that.

Otherwise, we calculate the timezone offset from GMT by taking the
difference of strtotime's results for the original time with
timezone and the same time marked as GMT. So it doesn't matter that
strtotime has offset both of them to the computer's time zone--the
offset drops out of the formula.
*/
function parseDateRFC822TZ($input)
{
    $input = trim($input);
    $computerDatetime = strtotime($input);
    $timezoneOffset = null;
    $matches = array();

    if (preg_match('/([+-])([0-9]{2})([0-9]{2})$/', $input, $matches)) {
        // strip off a trailing numeric timezone
        $inputWithoutTZ = rtrim(substr($input, 0, -5));
        // The sign covers hours and minutes alike, so "-0530" is -330, not -270.
        $timezoneOffset = intval($matches[2], 10) * 60 + intval($matches[3], 10);
        if ($matches[1] == '-') {
            $timezoneOffset = -$timezoneOffset;
        }
    }
    elseif (substr($input, -3) == 'GMT') {
        // GMT abbrev
        $inputWithoutTZ = rtrim(substr($input, 0, -3));
        $timezoneOffset = 0;
    }
    elseif (preg_match('/ [A-Z]{3}$/', $input)) {
        // strip off a trailing textual timezone
        $inputWithoutTZ = substr($input, 0, -4);
    }
    else {
        $inputWithoutTZ = '';   // can't compute it
    }

    if (strlen($inputWithoutTZ)) {
        $inputDatetime = strtotime($inputWithoutTZ);
        if ($timezoneOffset === null) {
            // don't know the GMT offset yet; probably a text timezone
            $asGMTDatetime = strtotime($inputWithoutTZ . ' +0000');
            // Only trust the difference when strtotime understood both strings;
            // otherwise the offset stays unknown (null).
            if ($asGMTDatetime !== false && $computerDatetime !== false) {
                $timezoneOffset = (int) round(($asGMTDatetime - $computerDatetime) / 60);
            }
        }
    }
    else {
        /* Bad news. Couldn't find timezone indicator to strip off, so
        we can't calc timezone offset from GMT. $computerDatetime will
        probably be in "input time" already, assuming strtotime couldn't
        find a timezone either. If it could, we'll just end up returning
        "computer time". */
        $inputDatetime = $computerDatetime;
    }

    return array($inputDatetime, $timezoneOffset);
}

/*
Format a timestamp (seconds since the epoch) as SQL datetime text,
"YYYY-MM-DD HH:MM:SS", in the computer's local timezone.

Uses date() rather than strftime(), which is deprecated as of PHP 8.1;
the output is the same.
*/
function printDateSQL($input)
{
    return date('Y-m-d H:i:s', $input);
}

/*
XML parser callback for an opening tag.

An <item> tag starts a fresh post object with an empty category list; a
<guid> tag has its attributes saved in the global $currentGuidAttrs. The
accumulated text buffer is cleared for every opening tag.
*/
function startElement($parser, $name, $attrs)
{
    global $currentPost, $currentText, $currentGuidAttrs;

    switch ($name) {
        case 'item':
            $currentPost = new post();
            $currentPost->categories = array();
            break;

        case 'guid':
            $currentGuidAttrs = $attrs;
            break;
    }

    // Text collected so far belonged to the enclosing element; start over.
    $currentText = '';
}

/*
XML parser callback for a closing tag.

Stores the text accumulated since the opening tag into the matching field
of the current post, and hands the finished post to processPost() when an
item closes. Element names are matched both in prefixed form and in the
"namespace-URI name" form (space separator).

Closing tags seen while no item is open (for example the channel's own
<title> and <description>) are ignored: there is no post to attach them to,
and assigning a property on null is a fatal error in PHP 8.
*/
function endElement($parser, $name)
{
    // $currentGuidAttrs is filled in by startElement(); it must be declared
    // global here too, or the isPermaLink attribute is never seen.
    global $currentPost, $currentText, $currentGuidAttrs;

    if (is_object($currentPost)) {
        switch ($name) {
            case 'title':
            case 'http://www.w3.org/1999/02/22-rdf-syntax-ns# title':
                $currentPost->title = $currentText;
                break;

            case 'content:encoded':
            case 'http://purl.org/rss/1.0/modules/content/ encoded':
                $currentPost->content = $currentText;
                break;

            case 'description':
            case 'http://www.w3.org/1999/02/22-rdf-syntax-ns# description':
                // content:encoded trumps description, so only save the description
                // if there's no content already
                if (!isset($currentPost->content) || !strlen($currentPost->content)) {
                    $currentPost->content = $currentText;
                }
                break;

            case 'pubDate':
                list($currentPost->createDate, $currentPost->createDateTZ) = parseDateRFC822TZ($currentText);
                break;

            case 'dc:date':
            case 'http://purl.org/dc/elements/1.1/ date':
                list($currentPost->createDate, $currentPost->createDateTZ) = parseDateISO8601TZ($currentText);
                break;

            case 'dcterms:modified':
            case 'http://purl.org/dc/terms/ modified':
                list($currentPost->modDate, $currentPost->modDateTZ) = parseDateISO8601TZ($currentText);
                break;

            case 'category':
            case 'dc:subject':
            case 'http://purl.org/dc/elements/1.1/ subject':
                $currentPost->categories[] = $currentText;
                break;

            case 'guid':
                if (isset($currentGuidAttrs['isPermaLink']) && $currentGuidAttrs['isPermaLink'] == 'true') {
                    $currentPost->permalink = $currentText;
                }
                $currentPost->guid = $currentText;
                break;

            case 'item':
            case 'http://www.w3.org/1999/02/22-rdf-syntax-ns# item':
                processPost($currentPost);
                $currentPost = null;
                break;
        }
    }

    $currentText = '';
}

/*
XML parser callback for character data.

The parser may deliver an element's text in several pieces, so each piece
is appended to the global $currentText buffer.
*/
function characterData($parser, $data)
{
    global $currentText;

    $currentText = $currentText . $data;
}



// WordPress-specific code


// Login name of the WordPress user who will own the imported posts.
$post_author = 'admin';

// Load WordPress. The configuration file is located relative to this script
// rather than the current working directory, so the import also works when
// started from another directory (e.g. "php /path/to/import-rss.php").
require_once(dirname(__FILE__) . '/../wp-config.php');
require_once(ABSPATH . WPINC . '/template-functions.php');
require_once(ABSPATH . WPINC . '/functions.php');
require_once(ABSPATH . WPINC . '/vars.php');

/*
Helper for processPost(): work out how many minutes to add to a timestamp to
express it in local time and in GMT.

$ownTZ     offset (minutes from GMT, west negative) parsed with this date, or null
$otherTZ   offset parsed with the post's other date, used as a fallback
$defaultTZ configured default offset, used when neither date supplied one

Returns array(minutes to add for local time, minutes to add for GMT).
*/
function processPostDateOffsets($ownTZ, $otherTZ, $defaultTZ)
{
    if ($ownTZ === null) {
        // no timezone info; assume it's in local time and fall back to a GMT offset from somewhere else
        return array(0, $otherTZ ? -$otherTZ : -$defaultTZ);
    }
    if ($ownTZ == 0) {
        // we have GMT; try to offset it back to local time if an offset is available anywhere else
        return array($otherTZ ? $otherTZ : $defaultTZ, 0);
    }
    return array(0, -$ownTZ);
}

/*
Write one parsed feed item to the WordPress database.

Steps, in order:
  1. Skip the post if every one of its categories is listed in $kExcludeCategories.
     (A post with no categories, or at least one non-excluded category, is kept.)
  2. Derive local and GMT versions of the creation and modification dates.
  3. Look up each remaining category by name, creating it if missing.
  4. Look for an existing post with the same post_date and title. If found,
     update it when $kUpdatePostsAlways is set, or when $kUpdatePostsIfNewer is
     set and the feed's modification date is newer; otherwise skip it. If not
     found, insert a new post.

Progress is printed as HTML. When $kTakeNoAction is true the same report is
printed but no INSERT, UPDATE or DELETE is issued.

$post is an object with the fields of the post class. It is taken by
reference for compatibility with existing callers; it is not modified.
*/
function processPost(&$post)
{
    global $kSetModDateField, $kUpdatePostsAlways, $kUpdatePostsIfNewer, $kTakeNoAction;
    global $kExcludeCategories, $kDefaultTimezoneOffset;
    global $post_author;
    global $wpdb;
    global $tableusers, $tableposts, $tablepost2cat, $tablecategories;

    $excludedCategories = is_array($kExcludeCategories) ? $kExcludeCategories : array();
    $postCategories = is_array($post->categories) ? $post->categories : array();

    // Filter out (ignore) posts having categories that are all listed as "excluded".
    if (count($postCategories)) {
        $gotIncludedCategory = false;
        foreach ($postCategories as $categoryName) {
            if (!isset($excludedCategories[$categoryName])) {
                $gotIncludedCategory = true;
                break;
            }
        }
        if (!$gotIncludedCategory) {
            return;
        }
    }

    $post_author_ID = $wpdb->get_var("SELECT ID FROM $tableusers WHERE user_login = '" . mysql_escape_string($post_author) . "'");

    // Content is stored as-is apart from XHTMLifying <br> tags. Soft line wraps
    // are deliberately left alone (un-wrapping would also have to avoid <pre> sections).
    $post_content = str_replace('<br>', '<br />', $post->content);

    // Creation date, in local time and in GMT.
    list($offsetForLocal, $offsetForGMT) = processPostDateOffsets($post->createDateTZ, $post->modDateTZ, $kDefaultTimezoneOffset);
    $post_date = printDateSQL($post->createDate + 60 * $offsetForLocal);
    $post_date_gmt = printDateSQL($post->createDate + 60 * $offsetForGMT);

    // Modification date, in local time and in GMT.
    if (!$post->modDate) {
        // no modified date; use the create date
        $post_modified = $post_date;
        $post_modified_gmt = $post_date_gmt;
    }
    else {
        list($offsetForLocal, $offsetForGMT) = processPostDateOffsets($post->modDateTZ, $post->createDateTZ, $kDefaultTimezoneOffset);
        $post_modified = printDateSQL($post->modDate + 60 * $offsetForLocal);
        $post_modified_gmt = printDateSQL($post->modDate + 60 * $offsetForGMT);
    }

    $post_title = $post->title;

    // Try to find a name for the post, trailing from the permalink.
    // It is escaped where it is placed into SQL, not here, to avoid double escaping.
    $post_name = '';
    if (isset($post->permalink) && strlen($post->permalink)) {
        $matches = array();
        if (preg_match('|/[0-9]{4}/[0-9]{2}/[0-9]{2}/([A-Za-z0-9_-]*)/?|', $post->permalink, $matches)) {
            $post_name = $matches[1];
        }
    }

    // Resolve category names to IDs, creating categories that don't exist yet.
    $categoryIDList = array();
    foreach ($postCategories as $categoryName) {
        if (isset($excludedCategories[$categoryName])) {
            continue;
        }
        $categoryID = $wpdb->get_var("SELECT cat_ID FROM $tablecategories WHERE cat_name = '" . mysql_escape_string($categoryName) . "'");
        if (!$categoryID) {
            if ($kTakeNoAction) {
                echo "Would have inserted new category '" . htmlspecialchars($categoryName) . "'.";
                $categoryID = 0;
            }
            else {
                $categoryNiceName = sanitize_title($categoryName);
                $wpdb->query("INSERT INTO $tablecategories"
                    . " (cat_name, category_nicename)"
                    . " VALUES ('" . mysql_escape_string($categoryName) . "',"
                    . "'" . mysql_escape_string($categoryNiceName) . "')");
                $categoryID = $wpdb->get_var("SELECT LAST_INSERT_ID()");
            }
        }
        $categoryIDList[] = $categoryID;
    }

    print "<br />\n\n";
    print "Post: '" . htmlspecialchars($post_title) . "'<br />\n";
    print "Timestamp: $post_date ($post_date_gmt GMT)<br />\n";
    if ($post_modified && $post_date_gmt != $post_modified_gmt) {
        print "Modified: $post_modified ($post_modified_gmt GMT)<br />\n";
    }

    // Quick-n-dirty check for dups: same local post date and same title.
    $dupcheck = $wpdb->get_results(
        "SELECT ID,post_date,post_title,post_modified,post_modified_gmt"
        . " FROM $tableposts"
        . " WHERE post_date='" . mysql_escape_string($post_date) . "'"
        . " AND post_title='" . mysql_escape_string($post_title) . "'"
        . " LIMIT 1", ARRAY_A);
    $postExists = is_array($dupcheck) && !empty($dupcheck[0]['ID']);

    if ($postExists) {
        $feedIsNewer = $post->modDate && $dupcheck[0]['post_modified_gmt'] < $post_modified_gmt;
        if ($kUpdatePostsAlways || ($kUpdatePostsIfNewer && $feedIsNewer)) {
            print "Updating post, ID = '" . $dupcheck[0]['ID'] . "'<br />\n";
            print "Old version: " . $dupcheck[0]['post_modified'] . "; new version: $post_modified.<br />\n";
            if (!$kTakeNoAction) {
                $postID = $dupcheck[0]['ID'];

                // post_date is intentionally left untouched on update.
                $setClauses = array("post_author = '" . mysql_escape_string($post_author_ID) . "'");
                if ($kSetModDateField) {
                    $setClauses[] = "post_modified = '" . mysql_escape_string($post_modified) . "'";
                    $setClauses[] = "post_modified_gmt = '" . mysql_escape_string($post_modified_gmt) . "'";
                }
                $setClauses[] = "post_content = '" . mysql_escape_string($post_content) . "'";
                $setClauses[] = "post_title = '" . mysql_escape_string($post_title) . "'";
                $setClauses[] = "post_name = '" . mysql_escape_string($post_name) . "'";

                $wpdb->query("UPDATE $tableposts SET " . implode(', ', $setClauses)
                    . " WHERE ID = " . intval($postID));

                // Replace the post's category links with the ones from the feed.
                $wpdb->query("DELETE FROM $tablepost2cat WHERE post_id = " . intval($postID));
                foreach ($categoryIDList as $categoryID) {
                    $wpdb->query("INSERT INTO $tablepost2cat (post_id, category_id)"
                        . " VALUES (" . intval($postID) . "," . intval($categoryID) . ")");
                }
            }
        }
        else {
            print "Skipping duplicate post, ID = '" . $dupcheck[0]['ID'] . "'<br />\n";
            if (!$kUpdatePostsAlways && $kUpdatePostsIfNewer && !$post->modDate) {
                print "Warning: modification date was not present in the RSS file, so no check for updates can be performed.<br />\n";
            }
        }
    }
    else {
        print "Inserting new post.<br />\n";
        if (!$kTakeNoAction) {
            $columns = 'post_author,post_date,post_date_gmt,post_content,post_title,post_name,post_category';
            // post_author holds the user's numeric ID, not the login name.
            $fieldValues = array($post_author_ID, $post_date, $post_date_gmt,
                $post_content, $post_title, $post_name, '1');
            if ($kSetModDateField) {
                $columns .= ',post_modified,post_modified_gmt';
                $fieldValues[] = $post_modified;
                $fieldValues[] = $post_modified_gmt;
            }

            $wpdb->query("INSERT INTO $tableposts ($columns) VALUES ('"
                . implode("','", array_map('mysql_escape_string', $fieldValues))
                . "')");

            $postID = $wpdb->get_var("SELECT LAST_INSERT_ID();");
            if ($postID) {
                foreach ($categoryIDList as $categoryID) {
                    $wpdb->query("INSERT INTO $tablepost2cat (post_id, category_id)"
                        . " VALUES (" . intval($postID) . "," . intval($categoryID) . ")");
                }
            }
        }
    }
}

// XML parsing code
/*
Parse one RSS file and feed its elements to the startElement(), endElement()
and characterData() handlers, which import each item as it completes.

$filePath is anything fopen() can read (a local path, or a URL where the PHP
configuration allows it).

The script is terminated with die() if the file cannot be opened or the XML
is not well-formed.
*/
function importRSSFile($filePath)
{
    if (function_exists('xml_parser_create_ns')) {
        $xml_parser = xml_parser_create_ns('iso-8859-1', ' ');    // space sep for namespace URI
    }
    else {
        $xml_parser = xml_parser_create();
    }
    // make sure to turn off case-folding; XML 1.0 is case-sensitive
    xml_parser_set_option($xml_parser, XML_OPTION_CASE_FOLDING, false);
    xml_set_element_handler($xml_parser, "startElement", "endElement");
    xml_set_character_data_handler($xml_parser, "characterData");

    if (!($fp = fopen($filePath, "r"))) {
        xml_parser_free($xml_parser);
        die("could not open XML input");
    }

    $parsedOk = true;
    while ($parsedOk && !feof($fp)) {
        $data = fread($fp, 4096);
        // Compare strictly: a chunk such as "0" is falsy but is still data.
        if ($data === false || $data === '') {
            break;
        }
        $parsedOk = (bool) xml_parse($xml_parser, $data, false);
    }
    if ($parsedOk) {
        // Always signal end of input, so a truncated document is reported
        // even when the last read ended exactly at the end of the file.
        $parsedOk = (bool) xml_parse($xml_parser, '', true);
    }

    $errorMessage = '';
    if (!$parsedOk) {
        // Collect the details before the parser is released.
        $errorMessage = sprintf("XML error: %s at line %d",
            xml_error_string(xml_get_error_code($xml_parser)),
            xml_get_current_line_number($xml_parser));
    }

    xml_parser_free($xml_parser);
    fclose($fp);

    if (!$parsedOk) {
        die($errorMessage);
    }
}

/*
Import a blogBrowser archive: one folder per year, one RSS file per month,
e.g. <dirPath>/2003/12.xml.

Every month from 1990 through the current year is tried. Months with no
file are skipped up front, because importRSSFile() terminates the whole
script when it cannot open its input. Local paths are checked with
is_file(); anything containing "://" is probed by opening it.
*/
function importBlogArchive($dirPath)
{
    $startYear = 1990;
    $endYear = intval(date('Y'));
    $basePath = rtrim($dirPath, '/');   // avoid a doubled slash when $dirPath ends in '/'

    for ($testYear = $startYear; $testYear <= $endYear; $testYear++) {
        for ($testMonth = 1; $testMonth <= 12; $testMonth++) {
            $rssFilePath = sprintf('%s/%d/%02d.xml', $basePath, $testYear, $testMonth);

            if (strpos($rssFilePath, '://') === false) {
                $available = is_file($rssFilePath);
            }
            else {
                // Remote archive: the only way to find out is to try opening it.
                $probe = @fopen($rssFilePath, 'r');
                $available = ($probe !== false);
                if ($available) {
                    fclose($probe);
                }
            }

            if ($available) {
                importRSSFile($rssFilePath);
            }
        }
    }
}


// Entry point: a path that neither ends in '/' nor names a directory is a
// single RSS file; everything else is treated as a blogBrowser archive.
if (substr($path, -1) != '/' && !is_dir($path)) {
    importRSSFile($path);
}
else {
    importBlogArchive($path);
}

/*echo '<pre>';
print_r($EZSQL_ERROR);
echo '</pre>';
*/

?>