Hi,
OK, I believe I have been able to answer my own question, and I have posted the following in the hope that it will help someone else! My problem was basically that the characters had not been defined, and as a result some strange things were happening where characters were being reversed (because the algorithm thought that they were non-Arabic). I have been coding in C#, but similar changes could be made to the C++ version.
N.B. I have included (what I believe to be) the lookup table entries for the Urdu and Farsi characters but have not done the Pashto/Kurdish/others (indicated with 'Not yet included').
The first change I made was to lookupArray; the following entries can be tacked onto the end of the existing ones:
{0x653, 0, 0, 0, 0},
{0x654, 0, 0, 0, 0},
{0x655, 0, 0, 0, 0},
{0x656, 0, 0, 0, 0},
{0x657, 0, 0, 0, 0},
{0x658, 0, 0, 0, 0},
{0x659, 0, 0, 0, 0},
{0x65A, 0, 0, 0, 0},
{0x65B, 0, 0, 0, 0},
{0x65C, 0, 0, 0, 0},
{0x65D, 0, 0, 0, 0},
{0x65E, 0, 0, 0, 0},
{0x65F, 0, 0, 0, 0},
{0x660, 0x660, 0x660, 0x660, 0x660},
{0x661, 0x661, 0x661, 0x661, 0x661},
{0x662, 0x662, 0x662, 0x662, 0x662},
{0x663, 0x663, 0x663, 0x663, 0x663},
{0x664, 0x664, 0x664, 0x664, 0x664},
{0x665, 0x665, 0x665, 0x665, 0x665},
{0x666, 0x666, 0x666, 0x666, 0x666},
{0x667, 0x667, 0x667, 0x667, 0x667},
{0x668, 0x668, 0x668, 0x668, 0x668},
{0x669, 0x669, 0x669, 0x669, 0x669},
{0x66a, 0x66a, 0x66a, 0x66a, 0x66a},
{0x66b, 0x66b, 0x66b, 0x66b, 0x66b},
{0x66c, 0x66c, 0x66c, 0x66c, 0x66c},
{0x66d, 0x66d, 0x66d, 0x66d, 0x66d},
{0x66E, 0, 0, 0, 0},
{0x66F, 0, 0, 0, 0},
{0x670, 0, 0, 0, 0},
{0x671, 0xfb51, 0xfb50, 0xfb51, 0xfb50},
{0x672, 0, 0, 0, 0},
{0x673, 0, 0, 0, 0},
{0x674, 0, 0, 0, 0},
{0x675, 0, 0, 0, 0},
{0x676, 0, 0, 0, 0},
{0x677, 0, 0, 0, 0},
{0x678, 0, 0, 0, 0},
{0x679, 0xfb67, 0xfb68, 0xfb69, 0xfb66},
{0x67A, 0, 0, 0, 0},
{0x67B, 0, 0, 0, 0},
{0x67c, 0x67c, 0x67c, 0x67c, 0x67c},
{0x67D, 0, 0, 0, 0},
{0x67e, 0xfb57, 0xfb58, 0xfb59, 0xfb56},
{0x67F, 0, 0, 0, 0},
{0x680, 0, 0, 0, 0},
{0x681, 0x681, 0x681, 0x681, 0x681},
{0x682, 0, 0, 0, 0},
{0x683, 0, 0, 0, 0},
{0x684, 0, 0, 0, 0},
{0x685, 0x685, 0x685, 0x685, 0x685},
{0x686, 0xfb7b, 0xfb7c, 0xfb7d, 0xfb7a},
{0x687, 0, 0, 0, 0},
{0x688, 0xfb89, 0xfb88, 0xfb89, 0xfb88},
{0x689, 0x689, 0x689, 0x689, 0x689},
{0x68A, 0, 0, 0, 0},
{0x68B, 0, 0, 0, 0},
{0x68C, 0, 0, 0, 0},
{0x68D, 0, 0, 0, 0},
{0x68E, 0, 0, 0, 0},
{0x68F, 0, 0, 0, 0},
{0x690, 0, 0, 0, 0},
{0x691, 0xfb8d, 0xfb8c, 0xfb8d, 0xfb8c},
{0x692, 0, 0, 0, 0},
{0x693, 0x693, 0x693, 0x693, 0x693},
{0x694, 0x694, 0x694, 0x694, 0x694},
{0x695, 0x695, 0x695, 0x695, 0x695},
{0x696, 0x696, 0x696, 0x696, 0x696},
{0x697, 0, 0, 0, 0},
{0x698, 0xfb8b, 0xfb8a, 0xfb8b, 0xfb8a},
{0x699, 0, 0, 0, 0},
{0x69a, 0x69a, 0x69a, 0x69a, 0x69a},
{0x69B, 0, 0, 0, 0},
{0x69C, 0, 0, 0, 0},
{0x69D, 0, 0, 0, 0},
{0x69E, 0, 0, 0, 0},
{0x69F, 0, 0, 0, 0},
{0x6a0, 0x6a0, 0x6a0, 0x6a0, 0x6a0},
{0x6A1, 0, 0, 0, 0},
{0x6A2, 0, 0, 0, 0},
{0x6A3, 0, 0, 0, 0},
{0x6a4, 0x6a4, 0x6a4, 0x6a4, 0x6a4},
{0x6A5, 0, 0, 0, 0},
{0x6A6, 0, 0, 0, 0},
{0x6A7, 0, 0, 0, 0},
{0x6A8, 0, 0, 0, 0},
{0x6a9, 0xfb8f, 0xfb90, 0xfb91, 0xfb8e},
{0x6AA, 0, 0, 0, 0},
{0x6ab, 0x6ab, 0x6ab, 0x6ab, 0x6ab},
{0x6ac, 0x6ac, 0x6ac, 0x6ac, 0x6ac},
{0x6ad, 0x6ad, 0x6ad, 0x6ad, 0x6ad},
{0x6AE, 0, 0, 0, 0},
{0x6af, 0xfb93, 0xfb94, 0xfb95, 0xfb92},
{0x6B0, 0, 0, 0, 0},
{0x6B1, 0, 0, 0, 0},
{0x6B2, 0, 0, 0, 0},
{0x6B3, 0, 0, 0, 0},
{0x6B4, 0, 0, 0, 0},
{0x6b5, 0x6b5, 0x6b5, 0x6b5, 0x6b5},
{0x6b6, 0x6b6, 0x6b6, 0x6b6, 0x6b6},
{0x6b7, 0x6b7, 0x6b7, 0x6b7, 0x6b7},
{0x6B8, 0, 0, 0, 0},
{0x6B9, 0, 0, 0, 0},
{0x6ba, 0xfb9f, 0xfb9e, 0xfb9f, 0xfb9e},
{0x6BB, 0, 0, 0, 0},
{0x6bc, 0x6bc, 0x6bc, 0x6bc, 0x6bc},
{0x6bd, 0x6bd, 0x6bd, 0x6bd, 0x6bd},
{0x6be, 0xfbab, 0xfbac, 0xfbad, 0xfbaa},
{0x6BF, 0, 0, 0, 0},
{0x6c0, 0x6c0, 0x6c0, 0x6c0, 0x6c0},
{0x6c1, 0xfba7, 0xfba6, 0xfba7, 0xfba6},
{0x6c2, 0xfba5, 0xfba4, 0xfba5, 0xfba4},
{0x6c3, 0x6c3, 0x6c3, 0x6c3, 0x6c3},
{0x6C4, 0, 0, 0, 0},
{0x6C5, 0, 0, 0, 0},
{0x6c6, 0x6c6, 0x6c6, 0x6c6, 0x6c6},
{0x6c7, 0x6c7, 0x6c7, 0x6c7, 0x6c7},
{0x6c8, 0x6c8, 0x6c8, 0x6c8, 0x6c8},
{0x6c9, 0x6c9, 0x6c9, 0x6c9, 0x6c9},
{0x6CA, 0, 0, 0, 0},
{0x6cb, 0x6cb, 0x6cb, 0x6cb, 0x6cb},
{0x6cc, 0xfbfd, 0xfbfe, 0xfbff, 0xfbfc},
{0x6cd, 0x6cd, 0x6cd, 0x6cd, 0x6cd},
{0x6ce, 0x6ce, 0x6ce, 0x6ce, 0x6ce},
{0x6cf, 0x6cf, 0x6cf, 0x6cf, 0x6cf},
{0x6D0, 0x6D0, 0x6D0, 0x6D0, 0x6D0},
{0x6D1, 0x6D1, 0x6D1, 0x6D1, 0x6D1},
{0x6D2, 0xFBAF, 0xFBAE, 0xFBAF, 0xFBAE},
{0x6D3, 0xFBB1, 0xFBB0, 0xFBB1, 0xFBB0},
{0x6D4, 0x6D4, 0x6D4, 0x6D4, 0x6D4},
{0x6D5, 0x6D5, 0x6D5, 0x6D5, 0x6D5},
{0x6D6, 0, 0, 0, 0},
{0x6D7, 0, 0, 0, 0},
{0x6D8, 0, 0, 0, 0},
{0x6D9, 0, 0, 0, 0},
{0x6DA, 0, 0, 0, 0},
{0x6DB, 0, 0, 0, 0},
{0x6DC, 0, 0, 0, 0},
{0x6DD, 0, 0, 0, 0},
{0x6DE, 0, 0, 0, 0},
{0x6DF, 0, 0, 0, 0},
{0x6E0, 0, 0, 0, 0},
{0x6E1, 0, 0, 0, 0},
{0x6E2, 0, 0, 0, 0},
{0x6E3, 0, 0, 0, 0},
{0x6E4, 0, 0, 0, 0},
{0x6E5, 0, 0, 0, 0},
{0x6E6, 0, 0, 0, 0},
{0x6E7, 0, 0, 0, 0},
{0x6E8, 0, 0, 0, 0},
{0x6E9, 0, 0, 0, 0},
{0x6EA, 0, 0, 0, 0},
{0x6EB, 0, 0, 0, 0},
{0x6EC, 0, 0, 0, 0},
{0x6ED, 0, 0, 0, 0},
{0x6EE, 0, 0, 0, 0},
{0x6EF, 0, 0, 0, 0},
{0x6F0, 0x6F0, 0x6F0, 0x6F0, 0x6F0},
{0x6F1, 0x6F1, 0x6F1, 0x6F1, 0x6F1},
{0x6F2, 0x6F2, 0x6F2, 0x6F2, 0x6F2},
{0x6F3, 0x6F3, 0x6F3, 0x6F3, 0x6F3},
{0x6F4, 0x6F4, 0x6F4, 0x6F4, 0x6F4},
{0x6F5, 0x6F5, 0x6F5, 0x6F5, 0x6F5},
{0x6F6, 0x6F6, 0x6F6, 0x6F6, 0x6F6},
{0x6F7, 0x6F7, 0x6F7, 0x6F7, 0x6F7},
{0x6F8, 0x6F8, 0x6F8, 0x6F8, 0x6F8},
{0x6F9, 0x6F9, 0x6F9, 0x6F9, 0x6F9},
Next, I updated set1 and set2 to include the new characters:
// First of the two code-point sets consulted by the shaping code.
// Values are listed in ascending order; the final row (0x0679 onward) holds
// the Urdu/Farsi code points added to the original Arabic list.
// NOTE(review): presumably this must stay sorted if it is searched with
// BinarySearch — confirm against the caller.
private static readonly int[] set1 = new int[]
{
    0x0626, 0x0628, 0x062A, 0x062B,
    0x062C, 0x062D, 0x062E, 0x0633,
    0x0634, 0x0635, 0x0636, 0x0637,
    0x0638, 0x0639, 0x063A, 0x0640,
    0x0641, 0x0642, 0x0643, 0x0644,
    0x0645, 0x0646, 0x0647, 0x064A,
    0x0679, 0x067E, 0x0686, 0x0698, 0x06A9, 0x06AF, 0x06BE, 0x06CC
};
// Second of the two code-point sets consulted by the shaping code.
// Values are listed in ascending order; everything from 0x0671 onward was
// added for the Urdu/Farsi characters.
// NOTE(review): Unicode's ArabicShaping.txt classifies U+06C1 (HEH GOAL) as
// dual-joining, yet it is listed here alongside letters such as 0x0627 and
// 0x062F — confirm whether it belongs in set1 with four distinct forms.
private static readonly int[] set2 = new int[]
{
    0x0622, 0x0623, 0x0624, 0x0625, 0x0627, 0x0629,
    0x062F, 0x0630, 0x0631, 0x0632, 0x0648, 0x0649,
    0x0671, 0x0688, 0x0691, 0x06BA,
    0x06C1, 0x06C2, 0x06C3, 0x06D2, 0x06D3
};
Finally, I had to modify the IsArabic function to handle the new linking characters and numbers:
/// <summary>
/// Decides whether <paramref name="ch"/> should be handled as an Arabic
/// character by the shaping code.
/// </summary>
/// <param name="ch">The UTF-16 code unit to classify.</param>
/// <returns>
/// <c>false</c> when the code unit lies outside the 0x06xx, 0xFExx and 0xFBxx
/// pages, or inside 0x0660-0x066F or 0x06F0-0x06FF; otherwise <c>true</c>
/// exactly when the value is found in <c>arabicArray</c>.
/// </returns>
private static bool IsArabic(char ch)
{
    int code = ch;
    int page = code & 0xff00;   // high byte selects the 256-code-point page
    int row = code & 0xfff0;    // high 12 bits select the 16-code-point row

    bool onArabicPage = page == 0x0600 || page == 0xfe00 || page == 0xfb00;
    if (!onArabicPage)
        return false;

    // These two rows hold the Arabic-Indic and Extended Arabic-Indic digits;
    // they are rejected before the table lookup (presumably so that numbers
    // are not treated as linking letters — confirm with the caller).
    if (row == 0x0660 || row == 0x06f0)
        return false;

    return BinarySearch(arabicArray, code) >= 0;
}
I have tried to do my best with regard to getting the right characters in the right places but, as I said in my previous post, I don't know Arabic or Urdu, so there may be mistakes. If someone who does know Arabic/Urdu finds a problem, I am certainly interested to know!
I would also like to send out my sincere thanks to Mohamed Abdel-Monem and Sherif ElMetainy for their fantastic work!
Enjoy..
Andrew
|