This patch against Tavi 0.26 aims at resolving the problems discussed at TaviBugs/InternationalCharacters.
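PHP has had better Unicode support since 4.4: the /u modifier [1] makes a regex treat both pattern and subject as UTF-8, and \p [2] lets you match characters by Unicode property, such as case. This patch therefore mainly does three things: it sets the charset to utf-8, updates the WikiName pattern to use \p classes, and adds /u to all regexes where this pattern is used. It also changes the parse engine's flag character from chr(255) to chr(1), because the byte 0xFF can never occur in valid UTF-8 and would make /u regexes reject the text.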
Add this to your config.php:
// $Charset names the character set this wiki uses for storing, editing and
// displaying pages. Tavi ships with "ISO-8859-1" (Latin-1) as its default;
// "utf-8" is supported and is the recommended choice for international text.
// Be aware that Netscape does not behave correctly when editing utf-8 text,
// which is why upstream has not made "utf-8" the default.
$Charset = 'utf-8';
// Building blocks for the WikiName format, expressed as Unicode property
// classes. Any regex that embeds them must be compiled with the /u modifier.
$UpperPtn = '\p{Lu}';   // any uppercase letter
$LowerPtn = '\p{Ll}';   // any lowercase letter
$AlphaPtn = '\p{L}';    // any letter
// A WikiName: an uppercase letter, letters, at least one lowercase letter,
// another uppercase letter, letters, then optional "/SubPage" parts.
$LinkPtn = "{$UpperPtn}{$AlphaPtn}*{$LowerPtn}+{$UpperPtn}{$AlphaPtn}*"
         . "(?:(?:\\/{$UpperPtn}{$AlphaPtn}*)+)?";
Then apply the following patch to the rest of the installation (cd tavi/installation; patch -p1 < path/to/patch.file):
diff -burN release-0.26/lib/defaults.php utf8/lib/defaults.php
--- release-0.26/lib/defaults.php 2006-10-21 23:20:13.854956568 +0200
+++ utf8/lib/defaults.php 2006-10-21 23:55:13.840710208 +0200
@@ -26,7 +26,7 @@
"(?:[^ |\\/\"\']*\\/)*[^ |\\t\\n\\/\"\']*[A-Za-z0-9\\/?=&~_]";
// $InterWikiPtn establishes the format for InterWiki links in this wiki.
-$InterwikiPtn = "([A-Za-z0-9\xc0-\xfe]+):" .
+$InterwikiPtn = "([\p{L}\p{N}]+):" .
"((?:[^ |\\/\"\']*\\/)*[^ |\\t\\n\\/\"\']*[\\/=&~A-Za-z0-9])";
// Note: To avoid side effect of using parentheses in both $LinkPtn, $UrlPtn and
// $InterwikiPtn the special syntax (?: is used. This hides the parentheses
diff -burN release-0.26/lib/init.php utf8/lib/init.php
--- release-0.26/lib/init.php 2006-10-21 23:20:13.853956720 +0200
+++ utf8/lib/init.php 2006-10-21 23:21:44.735140680 +0200
@@ -16,7 +16,7 @@
$RtTbl = $DBTablePrefix . 'rate';
$RemTbl = $DBTablePrefix . 'remote_pages';
-$FlgChr = chr(255); // Flag character for parse engine.
+$FlgChr = chr(1); // Flag character for parse engine.
$pagestore = new PageStore();
$db = $pagestore->dbh;
diff -burN release-0.26/parse/html.php utf8/parse/html.php
--- release-0.26/parse/html.php 2006-10-21 23:20:14.259895008 +0200
+++ utf8/parse/html.php 2006-10-21 23:21:48.266603816 +0200
@@ -439,9 +439,9 @@
if(validate_page($page) != 1)
{ return $page; }
- $page = preg_replace("/(?<=$UpperPtn|$LowerPtn)($UpperPtn$LowerPtn)/",
- ' \\1', $page, -1);
- $page = preg_replace("/($LowerPtn)($UpperPtn)/",
+ $page = preg_replace("/($UpperPtn)($UpperPtn$LowerPtn)/u",
+ '\\1 \\2', $page, -1);
+ $page = preg_replace("/($LowerPtn)($UpperPtn)/u",
'\\1 \\2', $page, -1);
return $page;
}
diff -burN release-0.26/parse/macros.php utf8/parse/macros.php
--- release-0.26/parse/macros.php 2006-10-21 23:20:14.260894856 +0200
+++ utf8/parse/macros.php 2006-10-21 23:21:48.967497264 +0200
@@ -290,7 +290,7 @@
// Check for illegal characters to make search pattern safer against exploits
if ($search == '*') { // Match every title
$pattern = ".";
- } else if ( !preg_match("/^\^?(\/|$AlphaPtn|[-_0-9:;\*\(\)])+\\$?$/", $search)) {
+ } else if ( !preg_match("/^\^?(\/|$AlphaPtn|[-_0-9:;\*\(\)])+\\$?$/u", $search)) {
// Search can be locked at ^start and/or end$, contain alphanumeric
// characters, or characters: :;-_
// In addition the characters: (*) have special syntactic meanings
diff -burN release-0.26/parse/save.php utf8/parse/save.php
--- release-0.26/parse/save.php 2006-10-21 23:20:14.259895008 +0200
+++ utf8/parse/save.php 2006-10-21 23:21:46.693842912 +0200
@@ -67,7 +67,7 @@
$called = 1;
}
- if(preg_match('/^\*InterWiki:\s+\!?([A-Za-z\xc0-\xfe0-9]+)\s+((?:https?|file):[^\s]+)/',
+ if(preg_match('/^\*InterWiki:\s+\!?([\p{L}\p{N}]+)\s+((?:https?|file):[^\s]+)/u',
$text, $result))
{
$pagestore->new_interwiki($page, $result[1], $result[2]);
diff -burN release-0.26/parse/transforms.php utf8/parse/transforms.php
--- release-0.26/parse/transforms.php 2006-10-21 23:20:14.259895008 +0200
+++ utf8/parse/transforms.php 2006-10-21 23:21:47.326746696 +0200
@@ -109,9 +109,9 @@
if(!$EnableWikiLinks) { return $text; }
if($validate)
- { $ptn = "/(^|[^A-Za-z])(\\/?$LinkPtn)(())(\"\")?/e"; }
+ { $ptn = "/(^|[^A-Za-z])(\\/?$LinkPtn)(())(\"\")?/ue"; }
else
- { $ptn = "/(^|[^A-Za-z])(!?\\/?$LinkPtn)((\#[A-Za-z]([-A-Za-z0-9_:.]*[-A-Za-z0-9_])?)?)(\"\")?/e"; }
+ { $ptn = "/(^|[^A-Za-z])(!?\\/?$LinkPtn)((\#[A-Za-z]([-A-Za-z0-9_:.]*[-A-Za-z0-9_])?)?)(\"\")?/ue"; }
return preg_replace($ptn,
"q1('\\1').wikiname_token(q1('\\2'),'\\3')",
@@ -192,7 +192,7 @@
{
global $InterwikiPtn;
- return preg_replace("/(^|[^A-Za-z])($InterwikiPtn)(?=\$|[^\\/=&~A-Za-z0-9])/e",
+ return preg_replace("/(^|[^A-Za-z])($InterwikiPtn)(?=\$|[^\\/=&~A-Za-z0-9])/ue",
"q1('\\1').interwiki_token(q1('\\3'),q1('\\4')).q1('\\5')",
$text, -1);
}
@@ -213,7 +213,7 @@
{
global $UrlPtn,$InterwikiPtn;
- return preg_replace("/\\[($UrlPtn|$InterwikiPtn)]/Ue",
+ return preg_replace("/\\[($UrlPtn|$InterwikiPtn)]/Uue",
"url_token(q1('\\1'), '')", $text, -1);
}
function image_search($text)
@@ -227,7 +227,7 @@
function parse_hyperlink_description($text)
{
global $UrlPtn, $InterwikiPtn;
- return preg_replace("/\\[($UrlPtn|$InterwikiPtn) ([^]]+)]/e",
+ return preg_replace("/\\[($UrlPtn|$InterwikiPtn) ([^]]+)]/ue",
"url_token(q1('\\1'),image_search(q1('\\4')))",
$text, -1);
}
@@ -236,7 +236,7 @@
{
global $UrlPtn, $InterwikiPtn;
- return preg_replace("/(^|[^A-Za-z])($UrlPtn|$InterwikiPtn)(?=\$|[^\\/?=&~A-Za-z0-9])/e",
+ return preg_replace("/(^|[^A-Za-z])($UrlPtn|$InterwikiPtn)(?=\$|[^\\/?=&~A-Za-z0-9])/ue",
"q1('\\1').url_token(q1('\\2'), q1('\\2')).q1('\\5')", $text, -1);
}
@@ -246,7 +246,7 @@
static $count = 1;
// Expand interwiki-entry, if necessary
if ((!preg_match("/$UrlPtn/", $value)) and
- preg_match("/$InterwikiPtn/", $value, $match))
+ preg_match("/$InterwikiPtn/u", $value, $match))
{
$couldBeImage=($display==$value);
if (($url=$pagestore->interwiki($match[1])) != '')