From 27f06b7b66081a3371604eb68dcc36308d9cf0c0 Mon Sep 17 00:00:00 2001 From: Benjamin Renard Date: Fri, 8 Oct 2021 17:20:31 +0200 Subject: [PATCH] generate_lang_file CLI command: add -F/--fix-utf8 parameter Add an option to allow to try to load and fix existing translation files with broken UTF-8 characters. --- src/includes/class/class.LSlang.php | 23 +- src/includes/libs/ForceUTF8/Encoding.php | 351 +++++++++++++++++++++++ 2 files changed, 372 insertions(+), 2 deletions(-) create mode 100644 src/includes/libs/ForceUTF8/Encoding.php diff --git a/src/includes/class/class.LSlang.php b/src/includes/class/class.LSlang.php index 2d73dfdf..e45f17d9 100644 --- a/src/includes/class/class.LSlang.php +++ b/src/includes/class/class.LSlang.php @@ -260,6 +260,7 @@ function cli_generate_lang_file($command_args) { $output = False; $additionalfileformat = False; $keep_unused = False; + $fix_utf8 = False; $lang = null; $encoding = null; $available_formats = array('php', 'pot'); @@ -370,6 +371,11 @@ function cli_generate_lang_file($command_args) { $keep_unused = true; break; + case '--fix-utf8': + case '-F': + $fix_utf8 = true; + break; + default: $path = relative2absolute_path($command_args[$i]); if (is_file($path)) @@ -489,12 +495,19 @@ function cli_generate_lang_file($command_args) { } } + // In fix-utf8 mode, load ForceUT8/Encoding lib + if ($fix_utf8) + LSsession :: includeFile(LS_LIB_DIR . "ForceUTF8/Encoding.php"); + // Load translation files foreach($load_files as $path) { $LSlang_cli_logger -> debug("Load $path lang file"); @include($path); foreach($GLOBALS['LSlang'] as $msg => $trans) { - $translations[$msg]=$trans; + if ($fix_utf8) + $translations[\ForceUTF8\Encoding::fixUTF8($msg)] = \ForceUTF8\Encoding::fixUTF8($trans); + else + $translations[$msg] = $trans; } } @@ -504,7 +517,10 @@ function cli_generate_lang_file($command_args) { // Load lang string if lang was specify if ($lang && $encoding && isset($GLOBALS['LSlang']) && is_array($GLOBALS['LSlang'])) { foreach($GLOBALS['LSlang'] as $msg => $trans) { - $translations[$msg] = $trans; + if ($fix_utf8) + $translations[\ForceUTF8\Encoding::fixUTF8($msg)] = \ForceUTF8\Encoding::fixUTF8($trans); + else + $translations[$msg] = $trans; } } @@ -980,6 +996,7 @@ function cli_generate_lang_file_args_autocompleter($comp_words, $comp_word_num, '-f', '--format', '-I', '--include-upstream', '-K', '--keep-unused', + '-F', '--fix-utf8', ) ); return LScli :: autocomplete_opts($opts, $comp_word); @@ -1009,6 +1026,8 @@ LScli :: add_command( " -f/--format Output file format : php or pot", " (default: php)", " -K/--keep-unused Keep unused translations in resulting file", + " -F/--fix-utf8 Try to load and fix broken UTF-8 characters in", + " existing lang files." ), false, // This command does not need LDAP connection 'cli_generate_lang_file_args_autocompleter' diff --git a/src/includes/libs/ForceUTF8/Encoding.php b/src/includes/libs/ForceUTF8/Encoding.php new file mode 100644 index 00000000..20315929 --- /dev/null +++ b/src/includes/libs/ForceUTF8/Encoding.php @@ -0,0 +1,351 @@ + + * @package Encoding + * @version 2.0 + * @link https://github.com/neitanod/forceutf8 + * @example https://github.com/neitanod/forceutf8 + * @license Revised BSD + */ + +namespace ForceUTF8; + +class Encoding { + + const ICONV_TRANSLIT = "TRANSLIT"; + const ICONV_IGNORE = "IGNORE"; + const WITHOUT_ICONV = ""; + + protected static $win1252ToUtf8 = array( + 128 => "\xe2\x82\xac", + + 130 => "\xe2\x80\x9a", + 131 => "\xc6\x92", + 132 => "\xe2\x80\x9e", + 133 => "\xe2\x80\xa6", + 134 => "\xe2\x80\xa0", + 135 => "\xe2\x80\xa1", + 136 => "\xcb\x86", + 137 => "\xe2\x80\xb0", + 138 => "\xc5\xa0", + 139 => "\xe2\x80\xb9", + 140 => "\xc5\x92", + + 142 => "\xc5\xbd", + + + 145 => "\xe2\x80\x98", + 146 => "\xe2\x80\x99", + 147 => "\xe2\x80\x9c", + 148 => "\xe2\x80\x9d", + 149 => "\xe2\x80\xa2", + 150 => "\xe2\x80\x93", + 151 => "\xe2\x80\x94", + 152 => "\xcb\x9c", + 153 => "\xe2\x84\xa2", + 154 => "\xc5\xa1", + 155 => "\xe2\x80\xba", + 156 => "\xc5\x93", + + 158 => "\xc5\xbe", + 159 => "\xc5\xb8" + ); + + protected static $brokenUtf8ToUtf8 = array( + "\xc2\x80" => "\xe2\x82\xac", + + "\xc2\x82" => "\xe2\x80\x9a", + "\xc2\x83" => "\xc6\x92", + "\xc2\x84" => "\xe2\x80\x9e", + "\xc2\x85" => "\xe2\x80\xa6", + "\xc2\x86" => "\xe2\x80\xa0", + "\xc2\x87" => "\xe2\x80\xa1", + "\xc2\x88" => "\xcb\x86", + "\xc2\x89" => "\xe2\x80\xb0", + "\xc2\x8a" => "\xc5\xa0", + "\xc2\x8b" => "\xe2\x80\xb9", + "\xc2\x8c" => "\xc5\x92", + + "\xc2\x8e" => "\xc5\xbd", + + + "\xc2\x91" => "\xe2\x80\x98", + "\xc2\x92" => "\xe2\x80\x99", + "\xc2\x93" => "\xe2\x80\x9c", + "\xc2\x94" => "\xe2\x80\x9d", + "\xc2\x95" => "\xe2\x80\xa2", + "\xc2\x96" => "\xe2\x80\x93", + "\xc2\x97" => "\xe2\x80\x94", + "\xc2\x98" => "\xcb\x9c", + "\xc2\x99" => "\xe2\x84\xa2", + "\xc2\x9a" => "\xc5\xa1", + "\xc2\x9b" => "\xe2\x80\xba", + "\xc2\x9c" => "\xc5\x93", + + "\xc2\x9e" => "\xc5\xbe", + "\xc2\x9f" => "\xc5\xb8" + ); + + protected static $utf8ToWin1252 = array( + "\xe2\x82\xac" => "\x80", + + "\xe2\x80\x9a" => "\x82", + "\xc6\x92" => "\x83", + "\xe2\x80\x9e" => "\x84", + "\xe2\x80\xa6" => "\x85", + "\xe2\x80\xa0" => "\x86", + "\xe2\x80\xa1" => "\x87", + "\xcb\x86" => "\x88", + "\xe2\x80\xb0" => "\x89", + "\xc5\xa0" => "\x8a", + "\xe2\x80\xb9" => "\x8b", + "\xc5\x92" => "\x8c", + + "\xc5\xbd" => "\x8e", + + + "\xe2\x80\x98" => "\x91", + "\xe2\x80\x99" => "\x92", + "\xe2\x80\x9c" => "\x93", + "\xe2\x80\x9d" => "\x94", + "\xe2\x80\xa2" => "\x95", + "\xe2\x80\x93" => "\x96", + "\xe2\x80\x94" => "\x97", + "\xcb\x9c" => "\x98", + "\xe2\x84\xa2" => "\x99", + "\xc5\xa1" => "\x9a", + "\xe2\x80\xba" => "\x9b", + "\xc5\x93" => "\x9c", + + "\xc5\xbe" => "\x9e", + "\xc5\xb8" => "\x9f" + ); + + static function toUTF8($text){ + /** + * Function \ForceUTF8\Encoding::toUTF8 + * + * This function leaves UTF8 characters alone, while converting almost all non-UTF8 to UTF8. + * + * It assumes that the encoding of the original string is either Windows-1252 or ISO 8859-1. + * + * It may fail to convert characters to UTF-8 if they fall into one of these scenarios: + * + * 1) when any of these characters: ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß + * are followed by any of these: ("group B") + * ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶•¸¹º»¼½¾¿ + * For example: %ABREPRESENT%C9%BB. «REPRESENTÉ» + * The "«" (%AB) character will be converted, but the "É" followed by "»" (%C9%BB) + * is also a valid unicode character, and will be left unchanged. + * + * 2) when any of these: àáâãäåæçèéêëìíîï are followed by TWO chars from group B, + * 3) when any of these: ðñòó are followed by THREE chars from group B. + * + * @name toUTF8 + * @param string $text Any string. + * @return string The same string, UTF8 encoded + * + */ + + if(is_array($text)) + { + foreach($text as $k => $v) + { + $text[$k] = self::toUTF8($v); + } + return $text; + } + + if(!is_string($text)) { + return $text; + } + + $max = self::strlen($text); + + $buf = ""; + for($i = 0; $i < $max; $i++){ + $c1 = $text[$i]; + if($c1>="\xc0"){ //Should be converted to UTF8, if it's not UTF8 already + $c2 = $i+1 >= $max? "\x00" : $text[$i+1]; + $c3 = $i+2 >= $max? "\x00" : $text[$i+2]; + $c4 = $i+3 >= $max? "\x00" : $text[$i+3]; + if($c1 >= "\xc0" & $c1 <= "\xdf"){ //looks like 2 bytes UTF8 + if($c2 >= "\x80" && $c2 <= "\xbf"){ //yeah, almost sure it's UTF8 already + $buf .= $c1 . $c2; + $i++; + } else { //not valid UTF8. Convert it. + $cc1 = (chr(ord($c1) / 64) | "\xc0"); + $cc2 = ($c1 & "\x3f") | "\x80"; + $buf .= $cc1 . $cc2; + } + } elseif($c1 >= "\xe0" & $c1 <= "\xef"){ //looks like 3 bytes UTF8 + if($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf"){ //yeah, almost sure it's UTF8 already + $buf .= $c1 . $c2 . $c3; + $i = $i + 2; + } else { //not valid UTF8. Convert it. + $cc1 = (chr(ord($c1) / 64) | "\xc0"); + $cc2 = ($c1 & "\x3f") | "\x80"; + $buf .= $cc1 . $cc2; + } + } elseif($c1 >= "\xf0" & $c1 <= "\xf7"){ //looks like 4 bytes UTF8 + if($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf" && $c4 >= "\x80" && $c4 <= "\xbf"){ //yeah, almost sure it's UTF8 already + $buf .= $c1 . $c2 . $c3 . $c4; + $i = $i + 3; + } else { //not valid UTF8. Convert it. + $cc1 = (chr(ord($c1) / 64) | "\xc0"); + $cc2 = ($c1 & "\x3f") | "\x80"; + $buf .= $cc1 . $cc2; + } + } else { //doesn't look like UTF8, but should be converted + $cc1 = (chr(ord($c1) / 64) | "\xc0"); + $cc2 = (($c1 & "\x3f") | "\x80"); + $buf .= $cc1 . $cc2; + } + } elseif(($c1 & "\xc0") === "\x80"){ // needs conversion + if(isset(self::$win1252ToUtf8[ord($c1)])) { //found in Windows-1252 special cases + $buf .= self::$win1252ToUtf8[ord($c1)]; + } else { + $cc1 = (chr(ord($c1) / 64) | "\xc0"); + $cc2 = (($c1 & "\x3f") | "\x80"); + $buf .= $cc1 . $cc2; + } + } else { // it doesn't need conversion + $buf .= $c1; + } + } + return $buf; + } + + static function toWin1252($text, $option = self::WITHOUT_ICONV) { + if(is_array($text)) { + foreach($text as $k => $v) { + $text[$k] = self::toWin1252($v, $option); + } + return $text; + } elseif(is_string($text)) { + return static::utf8_decode($text, $option); + } else { + return $text; + } + } + + static function toISO8859($text, $option = self::WITHOUT_ICONV) { + return self::toWin1252($text, $option); + } + + static function toLatin1($text, $option = self::WITHOUT_ICONV) { + return self::toWin1252($text, $option); + } + + static function fixUTF8($text, $option = self::WITHOUT_ICONV){ + if(is_array($text)) { + foreach($text as $k => $v) { + $text[$k] = self::fixUTF8($v, $option); + } + return $text; + } + + if(!is_string($text)) { + return $text; + } + + $last = ""; + while($last <> $text){ + $last = $text; + $text = self::toUTF8(static::utf8_decode($text, $option)); + } + $text = self::toUTF8(static::utf8_decode($text, $option)); + return $text; + } + + static function UTF8FixWin1252Chars($text){ + // If you received an UTF-8 string that was converted from Windows-1252 as it was ISO8859-1 + // (ignoring Windows-1252 chars from 80 to 9F) use this function to fix it. + // See: http://en.wikipedia.org/wiki/Windows-1252 + + return str_replace(array_keys(self::$brokenUtf8ToUtf8), array_values(self::$brokenUtf8ToUtf8), $text); + } + + static function removeBOM($str=""){ + if(substr($str, 0,3) === pack("CCC",0xef,0xbb,0xbf)) { + $str=substr($str, 3); + } + return $str; + } + + protected static function strlen($text){ + return (function_exists('mb_strlen') && ((int) ini_get('mbstring.func_overload')) & 2) ? + mb_strlen($text,'8bit') : strlen($text); + } + + public static function normalizeEncoding($encodingLabel) + { + $encoding = strtoupper($encodingLabel); + $encoding = preg_replace('/[^a-zA-Z0-9\s]/', '', $encoding); + $equivalences = array( + 'ISO88591' => 'ISO-8859-1', + 'ISO8859' => 'ISO-8859-1', + 'ISO' => 'ISO-8859-1', + 'LATIN1' => 'ISO-8859-1', + 'LATIN' => 'ISO-8859-1', + 'UTF8' => 'UTF-8', + 'UTF' => 'UTF-8', + 'WIN1252' => 'ISO-8859-1', + 'WINDOWS1252' => 'ISO-8859-1' + ); + + if(empty($equivalences[$encoding])){ + return 'UTF-8'; + } + + return $equivalences[$encoding]; + } + + public static function encode($encodingLabel, $text) + { + $encodingLabel = self::normalizeEncoding($encodingLabel); + if($encodingLabel === 'ISO-8859-1') return self::toLatin1($text); + return self::toUTF8($text); + } + + protected static function utf8_decode($text, $option = self::WITHOUT_ICONV) + { + if ($option == self::WITHOUT_ICONV || !function_exists('iconv')) { + $o = utf8_decode( + str_replace(array_keys(self::$utf8ToWin1252), array_values(self::$utf8ToWin1252), self::toUTF8($text)) + ); + } else { + $o = iconv("UTF-8", "Windows-1252" . ($option === self::ICONV_TRANSLIT ? '//TRANSLIT' : ($option === self::ICONV_IGNORE ? '//IGNORE' : '')), $text); + } + return $o; + } +}