Intro

This patch against Tavi 0.26 aims at resolving the problems discussed at TaviBugs/InternationalCharacters.

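PHP has had better Unicode support since 4.4: the /u modifier [1] makes a regex treat its pattern and subject as UTF-8, and \p [2] matches characters by Unicode property, such as case. This patch therefore does mainly three things: it sets the charset to utf-8, updates the WikiName pattern, and adds /u to every regex in which that pattern is used. It also changes the parse engine's flag character from chr(255) to chr(1), because the byte 0xFF can never occur in valid UTF-8.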

Comments

The patch

Add this to your config.php:


// $Charset names the character set this wiki uses for storage, editing and
//   display.  Tavi's stock default is "ISO-8859-1" (Latin-1) because Netscape
//   does not behave correctly when editing utf-8 text; "utf-8" is nevertheless
//   supported and is the recommended choice for international text.
$Charset = 'utf-8';

// Building blocks of a WikiName, written as Unicode character properties:
//   an uppercase letter, a lowercase letter, and any letter.
$UpperPtn = '\p{Lu}';
$LowerPtn = '\p{Ll}';
$AlphaPtn = '\p{L}';

// A WikiName is: an uppercase letter, any letters, at least one lowercase
//   letter, another uppercase letter, any letters -- optionally followed by
//   one or more "/Name" segments that each start with an uppercase letter.
$LinkPtn = "{$UpperPtn}{$AlphaPtn}*{$LowerPtn}+{$UpperPtn}{$AlphaPtn}*"
         . "(?:(?:\\/{$UpperPtn}{$AlphaPtn}*)+)?";

Then apply the following patch to the rest of the installation (cd tavi/installation; patch -p1 < path/to/patch.file):


diff -burN release-0.26/lib/defaults.php utf8/lib/defaults.php
--- release-0.26/lib/defaults.php    2006-10-21 23:20:13.854956568 +0200
+++ utf8/lib/defaults.php    2006-10-21 23:55:13.840710208 +0200
@@ -26,7 +26,7 @@
            "(?:[^ |\\/\"\']*\\/)*[^ |\\t\\n\\/\"\']*[A-Za-z0-9\\/?=&~_]";
 
 // $InterWikiPtn establishes the format for InterWiki links in this wiki.
-$InterwikiPtn = "([A-Za-z0-9\xc0-\xfe]+):" .
+$InterwikiPtn = "([\p{L}\p{N}]+):" .
                 "((?:[^ |\\/\"\']*\\/)*[^ |\\t\\n\\/\"\']*[\\/=&~A-Za-z0-9])";
 // Note: To avoid side effect of using parentheses in both $LinkPtn, $UrlPtn and
 //       $InterwikiPtn the special syntax (?: is used. This hides the parentheses
diff -burN release-0.26/lib/init.php utf8/lib/init.php
--- release-0.26/lib/init.php    2006-10-21 23:20:13.853956720 +0200
+++ utf8/lib/init.php    2006-10-21 23:21:44.735140680 +0200
@@ -16,7 +16,7 @@
 $RtTbl = $DBTablePrefix . 'rate';
 $RemTbl = $DBTablePrefix . 'remote_pages';
 
-$FlgChr = chr(255);                     // Flag character for parse engine.
+$FlgChr = chr(1);                     // Flag character for parse engine.
 
 $pagestore = new PageStore();
 $db = $pagestore->dbh;
diff -burN release-0.26/parse/html.php utf8/parse/html.php
--- release-0.26/parse/html.php    2006-10-21 23:20:14.259895008 +0200
+++ utf8/parse/html.php    2006-10-21 23:21:48.266603816 +0200
@@ -439,9 +439,9 @@
 
   if(validate_page($page) != 1)
     { return $page; }
-  $page = preg_replace("/(?<=$UpperPtn|$LowerPtn)($UpperPtn$LowerPtn)/",
-                       ' \\1', $page, -1);
-  $page = preg_replace("/($LowerPtn)($UpperPtn)/",
+  $page = preg_replace("/($UpperPtn)($UpperPtn$LowerPtn)/u",
+                        '\\1 \\2', $page, -1);
+  $page = preg_replace("/($LowerPtn)($UpperPtn)/u",
                        '\\1 \\2', $page, -1);
   return $page;
 }
diff -burN release-0.26/parse/macros.php utf8/parse/macros.php
--- release-0.26/parse/macros.php    2006-10-21 23:20:14.260894856 +0200
+++ utf8/parse/macros.php    2006-10-21 23:21:48.967497264 +0200
@@ -290,7 +290,7 @@
   // Check for illegal characters to make search pattern safer against exploits
   if ($search == '*') {  // Match every title 
     $pattern = "."; 
-  } else if ( !preg_match("/^\^?(\/|$AlphaPtn|[-_0-9:;\*\(\)])+\\$?$/", $search)) {
+  } else if ( !preg_match("/^\^?(\/|$AlphaPtn|[-_0-9:;\*\(\)])+\\$?$/u", $search)) {
      // Search can be locked at ^start and/or end$, contain alphanumeric
      // characters, or characters: :;-_
      // In addition the characters: (*)  have special syntactic meanings
diff -burN release-0.26/parse/save.php utf8/parse/save.php
--- release-0.26/parse/save.php    2006-10-21 23:20:14.259895008 +0200
+++ utf8/parse/save.php    2006-10-21 23:21:46.693842912 +0200
@@ -67,7 +67,9 @@
     $called = 1;
   }
 
-  if(preg_match('/^\*InterWiki:\s+\!?([A-Za-z\xc0-\xfe0-9]+)\s+((?:https?|file):[^\s]+)/',
+  // InterWiki prefixes may contain any Unicode letter or digit, the same
+  // prefix class that $InterwikiPtn in lib/defaults.php accepts.
+  if(preg_match('/^\*InterWiki:\s+\!?([\p{L}\p{N}]+)\s+((?:https?|file):[^\s]+)/u',
                 $text, $result))
   {
     $pagestore->new_interwiki($page, $result[1], $result[2]);
diff -burN release-0.26/parse/transforms.php utf8/parse/transforms.php
--- release-0.26/parse/transforms.php    2006-10-21 23:20:14.259895008 +0200
+++ utf8/parse/transforms.php    2006-10-21 23:21:47.326746696 +0200
@@ -109,9 +109,9 @@
   if(!$EnableWikiLinks) { return $text; }
 
   if($validate)
-    { $ptn = "/(^|[^A-Za-z])(\\/?$LinkPtn)(())(\"\")?/e"; }
+    { $ptn = "/(^|[^A-Za-z])(\\/?$LinkPtn)(())(\"\")?/ue"; }
   else
-    { $ptn = "/(^|[^A-Za-z])(!?\\/?$LinkPtn)((\#[A-Za-z]([-A-Za-z0-9_:.]*[-A-Za-z0-9_])?)?)(\"\")?/e"; }
+    { $ptn = "/(^|[^A-Za-z])(!?\\/?$LinkPtn)((\#[A-Za-z]([-A-Za-z0-9_:.]*[-A-Za-z0-9_])?)?)(\"\")?/ue"; }
 
   return preg_replace($ptn,
                       "q1('\\1').wikiname_token(q1('\\2'),'\\3')",
@@ -192,7 +192,7 @@
 {
   global $InterwikiPtn;
 
-  return preg_replace("/(^|[^A-Za-z])($InterwikiPtn)(?=\$|[^\\/=&~A-Za-z0-9])/e",
+  return preg_replace("/(^|[^A-Za-z])($InterwikiPtn)(?=\$|[^\\/=&~A-Za-z0-9])/ue",
                       "q1('\\1').interwiki_token(q1('\\3'),q1('\\4')).q1('\\5')",
                       $text, -1);
 }
@@ -213,7 +213,7 @@
 {
   global $UrlPtn,$InterwikiPtn;
 
-  return preg_replace("/\\[($UrlPtn|$InterwikiPtn)]/Ue",
+  return preg_replace("/\\[($UrlPtn|$InterwikiPtn)]/Uue",
                       "url_token(q1('\\1'), '')", $text, -1);
 }
 function image_search($text) 
@@ -227,7 +227,7 @@
 function parse_hyperlink_description($text)
 {
   global $UrlPtn, $InterwikiPtn;
-  return preg_replace("/\\[($UrlPtn|$InterwikiPtn) ([^]]+)]/e",
+  return preg_replace("/\\[($UrlPtn|$InterwikiPtn) ([^]]+)]/ue",
                       "url_token(q1('\\1'),image_search(q1('\\4')))", 
                       $text, -1);
 }
@@ -236,7 +236,7 @@
 {
   global $UrlPtn, $InterwikiPtn;
 
-  return preg_replace("/(^|[^A-Za-z])($UrlPtn|$InterwikiPtn)(?=\$|[^\\/?=&~A-Za-z0-9])/e",
+  return preg_replace("/(^|[^A-Za-z])($UrlPtn|$InterwikiPtn)(?=\$|[^\\/?=&~A-Za-z0-9])/ue",
                       "q1('\\1').url_token(q1('\\2'), q1('\\2')).q1('\\5')", $text, -1);
 }
 
@@ -246,7 +246,7 @@
   static $count = 1;
   // Expand interwiki-entry, if necessary
   if ((!preg_match("/$UrlPtn/", $value)) and
-      preg_match("/$InterwikiPtn/", $value, $match))  
+      preg_match("/$InterwikiPtn/u", $value, $match))  
   {
      $couldBeImage=($display==$value);
      if (($url=$pagestore->interwiki($match[1])) != '')