IDNA resolver client

Gisle Vanem
Rate me:
3.33/5 (3 votes)
30 Mar 2004BSD7 min read
96.5K
849
Using non-ASCII in DNS name lookups.
idna_client.zip
- demo-1-gcc.exe
- demo-1-vc.exe
- demo-2-gcc.exe
- demo-2-vc.exe
- draft-skwan-utf8-dns-02.txt
- getopt.c
- getopt.h
- idna.c
- idna.h
- Makefile
- punycode.c
- punycode.h
- readme.txt
- rfc2181.txt
- rfc3490.txt
- rfc3492.txt
/*
 * Code for enabling lookup of names with non-ASCII letters via
 * ACE and IDNA (Internationalizing Domain Names in Applications)
 * Ref. RFC-3490.
 */

/*  \version 0.1: Mar 19, 2004 :
 *    G. Vanem - Created.
 *
 *  \version 0.2: Mar 29, 2004 :
 *    G. Vanem - Adapted for Windows (MSVC+MingW) and C++.
 */

#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>

#include "punycode.h"
#include "idna.h"

#define DIM(array)     ((int)(sizeof(array) / sizeof(array[0])))
#define ARGSUSED(foo)  (void)foo

#ifdef UNICODE
  #define STR_FMT  "%S"
  #define ATOI(s)  _wtoi (s)
#else
  #define STR_FMT  "%s"
  #define ATOI(s)  atoi (s)
#endif

#define IDNA_DEBUG(lvl, args) \
        do { \
          if (_idna_debug >= lvl && _idna_printf) { \
            (*_idna_printf) ("%s(%u): ", __FILE__, __LINE__); \
            (*_idna_printf) args; \
            if (_idna_printf == printf) \
               fflush (stdout); \
          } \
        } while (0)

int            _idna_winnls_errno = 0;
int            _idna_errno = 0;
int            _idna_debug = 0;
int (MS_CDECL *_idna_printf) (const char *fmt, ...) = printf;

/*
 * The following string is used to convert printable
 * Punycode characters to ASCII:
 */
static const char print_ascii[] = "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
                                  "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
                                  " !\"#$%&'()*+,-./"
                                  "0123456789:;<=>?"
                                  "@ABCDEFGHIJKLMNO"
                                  "PQRSTUVWXYZ[\\]^_"
                                  "`abcdefghijklmno"
                                  "pqrstuvwxyz{|}~\n";

static CRITICAL_SECTION critSection;
static UINT             cur_cp = CP_ACP;

/*
 * Get ANSI/system codepage.
 */
UINT IDNA_GetCodePage (void)
{
  CPINFOEX CPinfo;
  UINT     CP = 0;

  IDNA_DEBUG (2, ("OEM codepage %u\n", GetOEMCP()));
  CP = GetACP();

  if (GetCPInfoEx(CP, 0, &CPinfo))
     IDNA_DEBUG (2, ("ACP-name " STR_FMT "\n", CPinfo.CodePageName));
  return (CP);
}

/*
 * Callback for EnumSystemCodePages()
 */
static BOOL cp_found = FALSE;
static UINT cp_requested = 0;

static BOOL CALLBACK print_cp_info (LPTSTR cp_str)
{
  CPINFOEX cp_info;
  UINT     cp = ATOI (cp_str);

  if(!IsValidCodePage(cp))
  {
    IDNA_DEBUG (1, ("INVALID CODEPAGE: %u\n", cp));
    return (TRUE);
  }
  if (cp == cp_requested)
     cp_found = TRUE;

  IDNA_DEBUG (3, ("CP: %5u, ", cp));

  if (GetCPInfoEx(cp, 0, &cp_info))
       IDNA_DEBUG (3, ("name: " STR_FMT "\n", cp_info.CodePageName));
  else IDNA_DEBUG (3, ("name: <unknown>\n"));
  return (TRUE);
}

/*
 * Check if given codepage is available
 */
BOOL IDNA_CheckCodePage (UINT cp)
{
  cp_requested = cp;
  cp_found = FALSE;
  EnumSystemCodePages (print_cp_info, CP_INSTALLED);
  return (cp_found);
}

static void IDNA_exit (void)
{
  DeleteCriticalSection (&critSection);
}

/*
 * A safer strncpy()
 */
static char *StrLcpy (char *dst, const char *src, size_t len)
{
  assert (src != NULL);
  assert (dst != NULL);
  assert (len > 0);

  if (strlen(src) < len)
     return strcpy (dst, src);

  memcpy (dst, src, len);
  dst [len-1] = '\0';
  return (dst);
}

/*
 * Get active codpage and initialise crit-section.
 */
BOOL IDNA_init (WORD cp)
{
  if (cp == 0)
  {
    cp = IDNA_GetCodePage();
  }
  else if (!IDNA_CheckCodePage(cp))
  {
    _idna_errno = IDNAERR_ILL_CODEPAGE;
    _idna_winnls_errno = GetLastError();
    IDNA_DEBUG (0, ("IDNA_init: %s\n", IDNA_strerror(_idna_errno)));
    return (FALSE);
  }

  cur_cp = cp;
  IDNA_DEBUG (2, ("IDNA_init: Using codepage %u\n", cp));

  InitializeCriticalSection (&critSection);
  atexit (IDNA_exit);
  return (TRUE);
}

const char *IDNA_strerror (int err)
{
  static char buf[200];

  switch ((enum IDNA_errors)err)
  {
    case IDNAERR_OK:
         return ("No error");
    case IDNAERR_NOT_INIT:
         return ("Not initialised");
    case IDNAERR_PUNYCODE_BASE:
         return ("No Punycode error");
    case IDNAERR_PUNYCODE_BAD_INPUT:
         return ("Bad Punycode input");
    case IDNAERR_PUNYCODE_BIG_OUTBUF:
         return ("Punycode output buf too small");
    case IDNAERR_PUNYCODE_OVERFLOW:
         return ("Punycode arithmetic overflow");
    case IDNAERR_PUNY_ENCODE:
         return ("Mysterious Punycode encode result");
    case IDNAERR_ILL_CODEPAGE:
         return ("Illegal or no Codepage defined");
    case IDNAERR_WINNLS:
         if (FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM, NULL, _idna_winnls_errno,
                            LANG_NEUTRAL, buf, sizeof(buf)-1, NULL))
            return (buf);
  }
  sprintf (buf, "Unknown %d", err);
  return (buf);
}

/*
 * Convert a single ASCII codepoint from active codepage to Unicode.
 */
static BOOL conv_to_unicode (char ch, wchar_t *wc)
{
  int rc = MultiByteToWideChar (cur_cp, 0, (LPCSTR)&ch, 1, wc, 1);

  if (rc == 0)
  {
    _idna_winnls_errno = GetLastError();
    _idna_errno = IDNAERR_WINNLS;
    IDNA_DEBUG (1, ("conv_to_unicode failed; %s\n", IDNA_strerror(_idna_winnls_errno)));
    return (FALSE);
  }
  return (TRUE);
}

/*
 * Convert a single Unicode codepoint to ASCII in active codepage.
 * Allow 4 byte GB18030 Simplified Chinese to be converted.
 */
static BOOL conv_to_ascii (wchar_t wc, char *ch, int *len)
{
  int rc = WideCharToMultiByte (cur_cp, 0, &wc, 1, (LPSTR)ch, 4, NULL, NULL);

  if (rc == 0)
  {
    _idna_winnls_errno = GetLastError();
    _idna_errno = IDNAERR_WINNLS;
    IDNA_DEBUG (1, ("conv_to_ascii failed; %s\n", IDNA_strerror(_idna_winnls_errno)));
    return (FALSE);
  }
  *len = rc;
  return (TRUE);
}

/*
 * Split a domain-name into labels (no trailing dots)
 */
static char **split_labels (const char *name)
{
  static char  buf [MAX_HOST_LABELS][MAX_HOST_LEN];
  static char *res [MAX_HOST_LABELS+1];
  const  char *p = name;
  int    i;

  for (i = 0; i < MAX_HOST_LABELS && *p; i++)
  {
    const char *dot = strchr (p, '.');

    if (!dot)
    {
      res[i] = StrLcpy (buf[i], p, sizeof(buf[i]));
      i++;
      break;
    }
    res[i] = StrLcpy (buf[i], p, dot-p+1);
    p = ++dot;
  }
  res[i] = NULL;
  IDNA_DEBUG (3, ("split_labels: `%s', %d labels\n", name, i));
  return (res);
}

/*
 * Convert a single label to ACE form
 */
static char *convert_to_ACE (const char *name)
{
  static char out_buf [2*MAX_HOST_LEN];  /* A conservative guess */
  DWORD  ucs_input [MAX_HOST_LEN];
  BYTE   ucs_case [MAX_HOST_LEN];
  const  char *p;
  size_t in_len, out_len;
  int    i, c;
  punycode_status status;

  for (i = 0, p = name; *p; i++)
  {
    wchar_t ucs = 0;

    c = *p++;
    if (!conv_to_unicode (c, &ucs))
       break;
    ucs_input[i] = ucs;
    ucs_case[i]  = 0;
    IDNA_DEBUG (3, ("%c -> u+%04X\n", c, ucs));
  }
  in_len  = i;
  out_len = sizeof(out_buf);
  status  = punycode_encode (in_len, ucs_input, ucs_case, &out_len, out_buf);

  if (status != punycode_success)
  {
    _idna_errno = IDNAERR_PUNYCODE_BASE + status;
    out_len = 0;
  }

  for (i = 0; i < (int)out_len; i++)
  {
    c = out_buf[i];
    if (c < 0 || c > 127)
    {
      _idna_errno = IDNAERR_PUNY_ENCODE;
      IDNA_DEBUG (1, ("illegal Punycode result: %c (%d)\n", c, c));
      break;
    }
    if (!print_ascii[c])
    {
      _idna_errno = IDNAERR_PUNY_ENCODE;
      IDNA_DEBUG (1, ("Punycode not ASCII: %c (%d)\n", c, c));
      break;
    }
    out_buf[i] = print_ascii[c];
  }
  out_buf[i] = '\0';

  IDNA_DEBUG (2, ("punycode_encode: status %d, out_len %d, out_buf `%s'\n",
              status, out_len, out_buf));
  if (status == punycode_success && i == (int)out_len)   /* encoding and ASCII conversion okay */
     return (out_buf);
  return (NULL);
}

/*
 * Convert a single ACE encoded label to native encoding
 * u+XXXX is used to signify a lowercase character.
 * U+XXXX is used to signify a uppercase character.
 * Normally only lowercase should be expected here.
 */
static char *convert_from_ACE (const char *name)
{
  static char out_buf [MAX_HOST_LEN];
  DWORD  ucs_output [MAX_HOST_LEN];
  BYTE   ucs_case  [MAX_HOST_LEN];
  size_t ucs_len, i, j;
  punycode_status status;

  memset (&ucs_case, 0, sizeof(ucs_case));
  ucs_len = sizeof(ucs_output);
  status = punycode_decode (strlen(name), name, &ucs_len, ucs_output, ucs_case);

  if (status != punycode_success)
  {
    _idna_errno = IDNAERR_PUNYCODE_BASE + status;
    ucs_len = 0;
  }

  for (i = j = 0; i < ucs_len && j < sizeof(out_buf)-4; i++)
  {
    wchar_t ucs = ucs_output[i];
    int     len;

    if (!conv_to_ascii(ucs, out_buf+j, &len))
       break;
    IDNA_DEBUG (3, ("%c+%04X -> %.*s\n",
                ucs_case[i] ? 'U' : 'u', ucs, len, out_buf+j));
    j += len;
  }
  out_buf[j] = '\0';
  IDNA_DEBUG (2, ("punycode_decode: status %d, out_len %d, out_buf `%s'\n",
              status, ucs_len, out_buf));
  return (status == punycode_success ? out_buf : NULL);
}


/*
 * E.g. convert "www.troms�.no" to ACE:
 *
 * 1) Convert each label separately. "www", "troms�" and "no"
 * 2) "troms�" -> u+0074 u+0072 u+006F u+006D u+0073 u+00F8
 * 3) Pass this through `punycode_encode()' which gives "troms-zua".
 * 4) Repeat for all labels with non-ASCII letters.
 * 5) Prepending "xn--" for each converted label gives "www.xn--troms-zua.no".
 *
 * E.g. 2:
 *   "www.bl�b�rsyltet�y.no" -> "www.xn--blbrsyltety-y8aO3x.no"
 *
 * Ref. http://www.imc.org/idna/do-idna.cgi
 *      http://www.norid.no/domenenavnbaser/ace/ace_technical.en.html
 */
BOOL IDNA_convert_to_ACE (
          char   *name,   /* IN/OUT: native ASCII/ACE name */
          size_t *size)   /* IN:     length of name buf */
{                         /* OUT:    ACE encoded length */
  const  BYTE *p;
  const  char *ace;
  char  *in_name = name;
  char **labels;
  int    i;
  size_t len = 0;
  BOOL   rc = FALSE;

  EnterCriticalSection (&critSection);

  labels = split_labels (name);

  for (i = 0; labels[i]; i++)
  {
    ace = NULL;
    for (p = (const BYTE*)labels[i]; *p; p++)
        if (*p >= 0x80)  /* !! this may not be true for all codepages */
        {
          ace = convert_to_ACE (labels[i]);
          if (!ace)
             goto quit;
          break;
        }

    if (ace)
    {
      if (len + 5 + strlen(ace) > *size)
      {
        IDNA_DEBUG (1, ("input length exceeded\n"));
        goto quit;
      }
      name += sprintf (name, "xn--%s.", ace);
    }
    else  /* pass through unchanged */
    {
      if (len + 1 + strlen(labels[i]) > *size)
      {
        IDNA_DEBUG (1, ("input length exceeded\n"));
        goto quit;
      }
      name += sprintf (name, "%s.", labels[i]);
    }
  }
  if (in_name > name)   /* drop trailing '.' */
     name--;
  len = name - in_name;
  *name = '\0';
  *size = len;
  IDNA_DEBUG (2, ("IDNA_convert_to_ACE: `%s', %d bytes\n", in_name, len));
  rc = TRUE;

quit:
  LeaveCriticalSection (&critSection);
  return (rc);
}

/*
 * 1) Pass through labels w/o "xn--" prefix unaltered.
 * 2) Strip "xn--" prefix and pass to punycode_decode()
 * 3) Repeat for all labels with "xn--" prefix.
 * 4) Collect Unicode strings and convert to original codepage.
 */
BOOL IDNA_convert_from_ACE (
          char   *name,    /* IN/OUT: ACE/native ASCII name */
          size_t *size)    /* IN:     ACE raw string length */
{                          /* OUT:    ASCII decoded length */
  char  *in_name = name;
  char **labels;
  int    i;
  BOOL   rc = FALSE;

  EnterCriticalSection (&critSection);

  labels  = split_labels (name);

  for (i = 0; labels[i]; i++)
  {
    const char *ascii = NULL;
    const char *label = labels[i];

    if (!strncmp(label,"xn--",4) && label[4])
    {
      ascii = convert_from_ACE (label+4);
      if (!ascii)
         goto quit;
    }
    name += sprintf (name, "%s.", ascii ? ascii : label);
  }
  if (name > in_name)
     name--;
  *name = '\0';
  *size = name - in_name;
  rc = TRUE;

quit:
  LeaveCriticalSection (&critSection);
  return (rc);
}

/*
 * The rest is C++
 */
#ifdef __cplusplus
struct hostent *CIDNA_resolver::gethostbyname (const char *name)
{
  /* if the name is in the hosts file (or LMHOSTS or a WINS server) in native CP,
   * return that.
   */
  struct hostent *he = ::gethostbyname (name);
  const  char    *ace;

  if (he)
     return (he);

  ace = CIDNA_convert::convert_to_ACE (name);
  if (!ace)
     return (NULL);

  he = ::gethostbyname (ace);
  if (he)
     he->h_aliases[0] = (char*) ace;
  return (he);
}

struct hostent *CIDNA_resolver::gethostbyaddress (const char *addr_name, int size, int af)
{
  struct hostent *he = ::gethostbyaddr (addr_name, size, af);
  const  char    *name;

  if (!he || !he->h_name)
     return (NULL);

  name = CIDNA_convert::convert_from_ACE (he->h_name);
  if (name)
  {
    he->h_aliases[0] = he->h_name;
    he->h_name = (char*) name;
  }
  return (he);
}

/*
 * TODO:
 *
 * There seems to be considerable confusion on what the reentrant versions should
 * look like. Some specifies returning 'int' (Posix?). OpenBSD/NetBSD return
 * 'struct hostent*'. So I leave this as an excercise for the guru reader...
 */
#if 0
int CIDNA_resolver::gethostbyname_r (
    const char      *name,
    struct hostent  *res_buf,
    char            *buf,
    size_t           buflen,
    struct hostent **result,
    int             *h_errno_p)
{
  if (result)
     *result = res_buf;
  if (h_errno_p)
     *h_errno_p = h_errno;
  return (-1);
}

int CIDNA_resolver::gethostbyaddr_r (
    const char      *name,
    struct hostent  *res_buf,
    char            *buf,
    size_t           buflen,
    struct hostent **result,
    int             *h_errno_p)
{
  if (result)
     *result = res_buf;
  if (h_errno_p)
     *h_errno_p = h_errno;
  return (-1);
}

int CIDNA_resolver::gethostbyaddr_r (
    const char      *addr,
    int              len,
    int              type,
    struct hostent  *res_buf,
    char            *buf,
    size_t           buflen,
    struct hostent **result,
    int             *h_errno_p)
{
  if (result)
     *result = res_buf;
  if (h_errno_p)
     *h_errno_p = h_errno;
  return (-1);
}
#endif
#endif
By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.
If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.
License

This article, along with any associated source code and files, is licensed under The BSD License
Written By
Gisle Vanem
Norway
This member has not yet provided a Biography. Assume it's interesting and varied, and probably something to do with programming.
IDNA resolver client

License

Comments and Discussions