'');

error_reporting(E_ALL);

/**
 * Plain record holding the fields collected for one feed item while the
 * XML is being parsed. Instances are created in startElement() and handed
 * to processPost() from endElement().
 */
class post {
    var $title;
    var $content;
    var $createDate;    // seconds since the epoch, as returned by the parseDate*TZ() helpers
    var $createDateTZ;  // offset from GMT in minutes, or null if unknown
    var $modDate;
    var $modDateTZ;
    var $guid;
    var $permalink;     // set only when <guid isPermaLink="true">
    var $categories;
    var $postTitle;
}

// Parser state shared between the SAX callbacks below.
$currentPost = null;
$currentText = '';
$currentGuidAttrs = array();

/**
 * Converts a sign plus hour/minute digit strings into a signed number of
 * minutes. West of GMT ('-') is negative.
 */
function rssImportOffsetMinutes($sign, $hours, $minutes) {
    $total = intval($hours, 10) * 60 + intval($minutes, 10);
    return ($sign == '-') ? -$total : $total;
}

/**
 * Returns the date in SQL (MySQL, at least)-compatible text format by
 * slicing the date and time portions out of an ISO 8601 string.
 */
function parseDateISO8601($input) {
    return substr($input, 0, 10) . ' ' . substr($input, 11, 8);
}

/**
 * Parses a W3CDTF / ISO 8601 timestamp.
 *
 * Returns a two-element array:
 *   [0] the datetime in the original local timezone (in secs since the epoch),
 *   [1] the timezone offset from GMT in minutes, or null if unknown.
 *       West is negative.
 *
 * Accepts "YYYY-MM-DDThh:mm[:ss[.fff]]" optionally followed by "Z" or
 * "+hh:mm" / "-hh:mm" (colon optional). Anything else is handed to
 * strtotime() unchanged and reported with an unknown offset.
 */
function parseDateISO8601TZ($input) {
    $input = trim($input);
    $parts = array();
    $pattern = '/^([0-9]{4}-[0-9]{2}-[0-9]{2})T([0-9]{2}:[0-9]{2}(?::[0-9]{2})?)'
        . '(?:\.[0-9]+)?(Z|[+-][0-9]{2}:?[0-9]{2})?$/';

    if (!preg_match($pattern, $input, $parts)) {
        // malformed W3CDTF; punt to strtotime
        return array(strtotime($input), null);
    }

    // Parse date and time without the zone so the result stays in "input time".
    $datetime = strtotime($parts[1] . ' ' . $parts[2]);

    $timezoneOffset = null;
    if (isset($parts[3]) && strlen($parts[3])) {
        if ($parts[3] == 'Z') {
            $timezoneOffset = 0;
        } else {
            $digits = str_replace(':', '', substr($parts[3], 1));
            $timezoneOffset = rssImportOffsetMinutes(
                substr($parts[3], 0, 1),
                substr($digits, 0, 2),
                substr($digits, 2, 2)
            );
        }
    }

    return array($datetime, $timezoneOffset);
}

/**
 * Returns the date in SQL (MySQL, at least)-compatible text format.
 * The string is interpreted by strtotime(), so the result is expressed in
 * the computer's local timezone.
 */
function parseDateRFC822($input) {
    return date('Y-m-d H:i:s', strtotime($input));
}

/**
 * Parses an RFC 822 style timestamp (as used by <pubDate>).
 *
 * Returns a two-element array:
 *   [0] the datetime in the original local timezone, in seconds since the epoch,
 *   [1] the timezone offset from GMT in minutes, or null if unknown.
 *       West is negative.
 *
 * Three time zones are involved:
 *   "Computer time": as translated to the computer's local timezone.
 *   "Input time":    as written in the input string.
 *   GMT:             as translated to UTC.
 *
 * We want "input time" plus the input zone's offset from GMT, whereas
 * strtotime() yields "computer time". So the zone suffix is stripped before
 * parsing the datetime itself, which disables strtotime()'s zone correction.
 *
 * If the offset is readily available (a numeric "+hhmm"/"-hhmm" suffix or
 * "GMT") it is used directly. For a three-letter textual zone the offset is
 * computed as the difference between strtotime() of the stripped string
 * marked "+0000" and strtotime() of the original string; both values carry
 * the same shift to "computer time", so that shift cancels out.
 */
function parseDateRFC822TZ($input) {
    $input = trim($input);
    $computerDatetime = strtotime($input);
    $timezoneOffset = null;
    $inputWithoutTZ = '';   // stays empty when no zone suffix can be stripped
    $tz = array();

    if (preg_match('/([+-])([0-9]{2})([0-9]{2})$/', $input, $tz)) {
        // strip off a trailing numeric timezone
        $inputWithoutTZ = rtrim(substr($input, 0, -5));
        // Apply the sign to hours AND minutes: "-0530" is -330, not -270.
        $timezoneOffset = rssImportOffsetMinutes($tz[1], $tz[2], $tz[3]);
    } elseif (substr($input, -3) == 'GMT') {
        // GMT abbrev; rtrim() copes with both "... GMT" and "...GMT"
        $inputWithoutTZ = rtrim(substr($input, 0, -3));
        $timezoneOffset = 0;
    } elseif (preg_match('/ [A-Z]{3}$/', $input)) {
        // strip off a trailing textual timezone
        $inputWithoutTZ = substr($input, 0, -4);
    }

    if (!strlen($inputWithoutTZ)) {
        /* Bad news. Couldn't find a timezone indicator to strip off, so the
           offset from GMT stays unknown. $computerDatetime will probably be
           in "input time" already, assuming strtotime() couldn't find a
           timezone either. If it could, we just end up returning
           "computer time". */
        return array($computerDatetime, $timezoneOffset);
    }

    $inputDatetime = strtotime($inputWithoutTZ);
    if ($timezoneOffset === null) {
        // don't know the GMT offset yet; probably a text timezone
        $asGMTDatetime = strtotime($inputWithoutTZ . ' +0000');
        if ($asGMTDatetime !== false && $computerDatetime !== false) {
            $timezoneOffset = ($asGMTDatetime - $computerDatetime) / 60;
        }
    }

    return array($inputDatetime, $timezoneOffset);
}

/**
 * Formats a timestamp (seconds since the epoch) as 'YYYY-MM-DD hh:mm:ss'.
 */
function printDateSQL($input) {
    return date('Y-m-d H:i:s', $input);
}

/**
 * SAX start-element callback: opens a new post for each item and remembers
 * the attributes of a <guid> element for endElement().
 */
function startElement($parser, $name, $attrs) {
    global $currentPost, $currentText, $currentGuidAttrs;

    // The namespaced spelling mirrors the one endElement() treats as an item.
    // NOTE(review): RSS 1.0 feeds normally put <item> in the
    // http://purl.org/rss/1.0/ namespace, which neither handler lists - confirm.
    if ($name == 'item' || $name == 'http://www.w3.org/1999/02/22-rdf-syntax-ns# item') {
        $currentPost = new post();
        $currentPost->categories = array();
    } elseif ($name == 'guid') {
        $currentGuidAttrs = $attrs;
    }
    $currentText = '';
}

/**
 * SAX end-element callback: stores the text collected for the element that
 * just closed into the current post, and hands the post to processPost()
 * when the item itself closes.
 */
function endElement($parser, $name) {
    global $currentPost, $currentText, $currentGuidAttrs;

    if ($currentPost === null) {
        // Element outside any item (e.g. the channel's own title): ignore it.
        $currentText = '';
        return;
    }

    switch ($name) {
        case 'title':
        case 'http://www.w3.org/1999/02/22-rdf-syntax-ns# title':
            $currentPost->title = $currentText;
            break;

        case 'content:encoded':
        case 'http://purl.org/rss/1.0/modules/content/ encoded':
            $currentPost->content = $currentText;
            break;

        case 'description':
        case 'http://www.w3.org/1999/02/22-rdf-syntax-ns# description':
            // content:encoded trumps description, so only save the description
            // if there's no content already
            if (!isset($currentPost->content) || !strlen($currentPost->content)) {
                $currentPost->content = $currentText;
            }
            break;

        case 'pubDate':
            list($currentPost->createDate, $currentPost->createDateTZ) = parseDateRFC822TZ($currentText);
            break;

        case 'dc:date':
        case 'http://purl.org/dc/elements/1.1/ date':
            list($currentPost->createDate, $currentPost->createDateTZ) = parseDateISO8601TZ($currentText);
            break;

        case 'dcterms:modified':
        case 'http://purl.org/dc/terms/ modified':
            list($currentPost->modDate, $currentPost->modDateTZ) = parseDateISO8601TZ($currentText);
            break;

        case 'category':
        case 'dc:subject':
        case 'http://purl.org/dc/elements/1.1/ subject':
            $currentPost->categories[] = $currentText;
            break;

        case 'guid':
            if (isset($currentGuidAttrs['isPermaLink']) && $currentGuidAttrs['isPermaLink'] == 'true') {
                $currentPost->permalink = $currentText;
            }
            $currentPost->guid = $currentText;
            $currentGuidAttrs = array();    // don't leak attributes into the next guid
            break;

        case 'item':
        case 'http://www.w3.org/1999/02/22-rdf-syntax-ns# item':
            processPost($currentPost);
            $currentPost = null;
            break;
    }
    $currentText = '';
}

/**
 * SAX character-data callback: accumulates text for the element being read.
 */
function characterData($parser, $data) {
    global $currentText;
    $currentText .= $data;
}

// WordPress-specific code

$post_author = 'admin';

require_once('../wp-config.php');
require_once(ABSPATH.WPINC.'/template-functions.php');
require_once(ABSPATH.WPINC.'/functions.php');
require_once(ABSPATH.WPINC.'/vars.php');

/**
 * Works out how many minutes to add to a parsed timestamp to obtain the
 * local-time and GMT values stored in the database.
 *
 * $ownTZ is the offset parsed with the timestamp itself, $otherTZ the offset
 * of the post's other timestamp (used as a fallback) and $defaultTZ the
 * configured default. Returns array(offsetForLocal, offsetForGMT).
 */
function rssImportDateOffsets($ownTZ, $otherTZ, $defaultTZ) {
    $fallback = $otherTZ ? $otherTZ : ($defaultTZ === null ? 0 : $defaultTZ);

    if ($ownTZ === null) {
        // no timezone info; assume it's in local time and fall back to a
        // GMT offset from somewhere else
        return array(0, -$fallback);
    }
    if ($ownTZ == 0) {
        // we have GMT; try to offset it back to local time if an offset is
        // available anywhere else
        return array($fallback, 0);
    }
    return array(0, -$ownTZ);
}

/**
 * Inserts one post-to-category row per category ID for the given post.
 */
function rssImportLinkCategories($postID, $categoryIDList) {
    global $wpdb, $tablepost2cat;

    foreach ($categoryIDList as $categoryID) {
        $wpdb->query(
            "INSERT INTO $tablepost2cat (post_id, category_id)"
            ." VALUES (".intval($postID).",".intval($categoryID).")"
        );
    }
}

/**
 * Writes one parsed feed item into the WordPress tables.
 *
 * Skips items whose categories are all excluded, creates missing categories,
 * then either updates an existing post (matched on post_date + post_title)
 * or inserts a new one, depending on the $kUpdatePosts* settings. When
 * $kTakeNoAction is set, only the progress messages are printed.
 *
 * Queries are built with mysql_escape_string(), as in the rest of this
 * script, so the legacy mysql extension must be available.
 *
 * NOTE(review): several string literals below contain a bare line break
 * where HTML markup appears to have been stripped from this copy of the
 * file. They are reproduced as found - confirm against the original script.
 */
function processPost(&$post) {
    global $kSetModDateField, $kUpdatePostsAlways, $kUpdatePostsIfNewer, $kTakeNoAction;
    global $kExcludedCategories, $kExcludeCategories, $kDefaultTimezoneOffset;
    global $post_author;
    global $wpdb;
    global $tableusers, $tableposts, $tablepost2cat, $tablecategories;

    // The exclusion list is referred to under two spellings in this script;
    // accept whichever one the configuration actually defines.
    $excludedCategories = array();
    if (is_array($kExcludedCategories)) {
        $excludedCategories = $kExcludedCategories;
    } elseif (is_array($kExcludeCategories)) {
        $excludedCategories = $kExcludeCategories;
    }

    $categories = is_array($post->categories) ? $post->categories : array();

    // Filter out (ignore) posts having categories that are all listed as "excluded".
    // If a post has no categories, or at least one non-excluded category, it is still
    // included.
    if (sizeof($categories)) {
        $gotIncludedCategory = false;
        foreach ($categories as $categoryName) {
            if (!isset($excludedCategories[$categoryName])) {
                $gotIncludedCategory = true;
                break;
            }
        }
        if (!$gotIncludedCategory) {
            return;
        }
    }

    $post_author_ID = $wpdb->get_var(
        "SELECT ID FROM $tableusers WHERE user_login = '".mysql_escape_string($post_author)."'"
    );

    $post_content = $post->content;
    // NOTE(review): this was commented as "XHTMLify ... tags"; the search and
    // replacement literals presumably were '<br>' and '<br />' before the
    // markup was lost. As found, both are a single line break (a no-op).
    $post_content = str_replace('
', '
', $post_content);

    // Un-word-wrapping of the content (replacing \r and \n with spaces
    // outside preformatted sections) used to happen here and is disabled.

    // Creation date, in local time and in GMT.
    list($offsetForLocal, $offsetForGMT) =
        rssImportDateOffsets($post->createDateTZ, $post->modDateTZ, $kDefaultTimezoneOffset);
    $post_date = printDateSQL($post->createDate + 60*$offsetForLocal);
    $post_date_gmt = printDateSQL($post->createDate + 60*$offsetForGMT);

    // Modification date, falling back to the creation date.
    if (!$post->modDate) {
        $post_modified = $post_date;
        $post_modified_gmt = $post_date_gmt;
    } else {
        list($offsetForLocal, $offsetForGMT) =
            rssImportDateOffsets($post->modDateTZ, $post->createDateTZ, $kDefaultTimezoneOffset);
        $post_modified = printDateSQL($post->modDate + 60*$offsetForLocal);
        $post_modified_gmt = printDateSQL($post->modDate + 60*$offsetForGMT);
    }

    $post_title = $post->title;

    // try to find a name for the post, trailing from the permalink
    $post_name = '';
    if (isset($post->permalink) && strlen($post->permalink)) {
        $matches = array();
        if (preg_match('|/[0-9]{4}/[0-9]{2}/[0-9]{2}/([A-Za-z0-9_-]*)/?|', $post->permalink, $matches)) {
            $post_name = $matches[1];   // escaped when the query is built
        }
    }

    // Resolve category names to IDs, creating categories that don't exist yet.
    $categoryIDList = array();
    foreach ($categories as $categoryName) {
        if (isset($excludedCategories[$categoryName])) {
            continue;
        }
        $categoryID = $wpdb->get_var(
            "SELECT cat_ID FROM $tablecategories WHERE cat_name = '".mysql_escape_string($categoryName)."'"
        );
        if (!$categoryID) {
            if ($kTakeNoAction) {
                echo "Would have inserted new category '$categoryName'.";
                $categoryID = 0;
            } else {
                $categoryNiceName = sanitize_title($categoryName);
                $wpdb->query(
                    "INSERT INTO $tablecategories (cat_name, category_nicename) VALUES ('"
                    .mysql_escape_string($categoryName)."',"
                    ."'".mysql_escape_string($categoryNiceName)."')"
                );
                $categoryID = $wpdb->get_var("SELECT LAST_INSERT_ID()");
            }
        }
        $categoryIDList[] = $categoryID;
    }

    print "
\n\n";
    print "Post: '" . htmlspecialchars($post_title) . "'
\n";
    print "Timestamp: $post_date ($post_date_gmt GMT)
\n";
    if ($post_modified && $post_date_gmt != $post_modified_gmt) {
        print "Modified: $post_modified ($post_modified_gmt GMT)
\n";
    }

    // Quick-n-dirty check for dups:
    $dupcheck = $wpdb->get_results(
        "SELECT ID,post_date,post_title,post_modified,post_modified_gmt"
        ." FROM $tableposts"
        ." WHERE post_date='".mysql_escape_string($post_date)."'"
        ." AND post_title='".mysql_escape_string($post_title)."'"
        ." LIMIT 1", ARRAY_A);

    if (!empty($dupcheck[0]['ID'])) {
        // post already exists
        $existing = $dupcheck[0];
        $isNewer = $kUpdatePostsIfNewer && $post->modDate
            && $existing['post_modified_gmt'] < $post_modified_gmt;

        if ($kUpdatePostsAlways || $isNewer) {
            print "Updating post, ID = '" . $existing['ID'] . "'
\n";
            print "Old version: ".$existing['post_modified']."; new version: $post_modified.
\n";
            if (!$kTakeNoAction) {
                $postID = $existing['ID'];
                $wpdb->query(
                    "UPDATE $tableposts SET"
                    ." post_author = '".mysql_escape_string($post_author_ID)."',"
                    .($kSetModDateField
                        ? " post_modified = '".mysql_escape_string($post_modified)."',"
                         ." post_modified_gmt = '".mysql_escape_string($post_modified_gmt)."',"
                        : "")
                    ." post_content = '".mysql_escape_string($post_content)."',"
                    ." post_title = '".mysql_escape_string($post_title)."',"
                    ." post_name = '".mysql_escape_string($post_name)."'"
                    ." WHERE ID = ".intval($postID)
                );
                // Replace the post's category links wholesale.
                $wpdb->query("DELETE FROM $tablepost2cat WHERE post_id = ".intval($postID));
                rssImportLinkCategories($postID, $categoryIDList);
            }
        } else {
            print "Skipping duplicate post, ID = '" . $existing['ID'] . "'
\n";
            if (!$kUpdatePostsAlways && $kUpdatePostsIfNewer && !$post->modDate) {
                print "Warning: modification date was not present in the RSS file, so no check for updates can be performed.
\n";
            }
        }
        return;
    }

    print "Inserting new post.
\n";
    if ($kTakeNoAction) {
        return;
    }

    $fieldNames = "post_author,post_date,post_date_gmt,"
        ." post_content,post_title,post_name,post_category";
    // Store the author's ID, matching what the UPDATE branch writes.
    $fieldValues = array($post_author_ID, $post_date, $post_date_gmt,
        $post_content, $post_title, $post_name, '1');
    if ($kSetModDateField) {
        $fieldNames .= ",post_modified,post_modified_gmt";
        $fieldValues[] = $post_modified;
        $fieldValues[] = $post_modified_gmt;
    }

    $wpdb->query(
        "INSERT INTO $tableposts"
        ." (".$fieldNames.") VALUES ('"
        .implode("','", array_map('mysql_escape_string', $fieldValues))
        ."')"
    );
    $postID = $wpdb->get_var("SELECT LAST_INSERT_ID();");
    if ($postID) {
        rssImportLinkCategories($postID, $categoryIDList);
    }
}

// XML parsing code

/**
 * Parses one RSS file, feeding it through the SAX handlers above.
 * Terminates the script via die() if the file cannot be opened or is not
 * well-formed XML.
 */
function importRSSFile($filePath) {
    if (function_exists('xml_parser_create_ns')) {
        $xml_parser = xml_parser_create_ns('iso-8859-1', ' '); // space sep for namespace URI
    } else {
        $xml_parser = xml_parser_create();
    }
    // make sure to turn off case-folding; XML 1.0 is case-sensitive
    xml_parser_set_option($xml_parser, XML_OPTION_CASE_FOLDING, false);
    xml_set_element_handler($xml_parser, "startElement", "endElement");
    xml_set_character_data_handler($xml_parser, "characterData");

    if (!($fp = fopen($filePath, "r"))) {
        die("could not open XML input");
    }

    $gotData = false;
    $sentFinal = false;
    while (true) {
        $data = fread($fp, 4096);
        // Compare explicitly: a chunk consisting of "0" must not end the loop.
        if ($data === false || $data === '') {
            break;
        }
        $gotData = true;
        $sentFinal = feof($fp);
        if (!xml_parse($xml_parser, $data, $sentFinal)) {
            die(sprintf("XML error: %s at line %d",
                xml_error_string(xml_get_error_code($xml_parser)),
                xml_get_current_line_number($xml_parser)));
        }
    }
    // If end-of-file was only noticed after the last chunk had been parsed,
    // tell the parser now so that truncated documents are still reported.
    if ($gotData && !$sentFinal && !xml_parse($xml_parser, '', true)) {
        die(sprintf("XML error: %s at line %d",
            xml_error_string(xml_get_error_code($xml_parser)),
            xml_get_current_line_number($xml_parser)));
    }

    xml_parser_free($xml_parser);
    fclose($fp);
}

/**
 * Imports every "<dirPath>/YYYY/MM.xml" file that exists, for each month
 * from 1990 through the current year.
 */
function importBlogArchive($dirPath) {
    $startYear = 1990;
    $endYear = intval(date('Y'));
    for ($testYear = $startYear; $testYear <= $endYear; $testYear++) {
        for ($testMonth = 1; $testMonth <= 12; $testMonth++) {
            $rssFilePath = $dirPath.'/'.$testYear.'/'.($testMonth < 10 ? '0' : '').$testMonth.'.xml';
            // importRSSFile() dies on a missing file, so months without an
            // archive have to be skipped here.
            if (is_file($rssFilePath)) {
                importRSSFile($rssFilePath);
            }
        }
    }
}

if (substr($path, -1) == '/' || is_dir($path)) {
    importBlogArchive($path);
} else {
    importRSSFile($path);
}

// Debugging aid: print_r($EZSQL_ERROR) here to dump database errors.
?>