views:

58

answers:

2

I've implemented some of the changes suggested in this question, and (thanks very much) it works quite well, however... in the process I've seemed to break the post-declaration assignment operator. With the following code:

#include <cstdio>
#include "ucpp"
main() {
  ustring a = "test";
  ustring b = "ing";
  ustring c = "- -";
  ustring d = "cafe\xcc\x81";
  printf("%s\n", (a + b + c[1] + d).encode());
}

I get a nice "testing café" message. However, if I modify the code slightly so that the const char * conversion is done separately, post-declaration:

#include <cstdio>
#include "ucpp"
main() {
  ustring a = "test";
  ustring b = "ing";
  ustring c = "- -";
  ustring d;
  d = "cafe\xcc\x81";
  printf("%s\n", (a + b + c[1] + d).encode());
}

the ustring named d becomes blank, and all that is output is "testing ". My new code has three constructors, one void (which is probably the one being incorrectly used, and is used in the operator+ function), one that takes a const ustring &, and one that takes a const char *. The following is my new library code:

#include <cstdlib>
#include <cstring>
class ustring {
  int * values;
  long len;
  public:
  long length() {
    return len;
  }
  ustring() {
    len = 0;
    values = (int *) malloc(0);
  }
  ustring(const ustring &input) {
    len = input.len;
    values = (int *) malloc(sizeof(int) * len);
    for (long i = 0; i < len; i++)
      values[i] = input.values[i];
  }
  ustring operator=(ustring input) {
    ustring result(input);
    return result;
  }
  ustring(const char * input) {
    values = (int *) malloc(0);
    long s = 0;                                                                 // s = number of parsed chars
    int a, b, c, d, contNeed = 0, cont = 0;
    for (long i = 0; input[i]; i++)
      if (input[i] < 0x80) {                                                    // ASCII, direct copy (00-7f)
        values = (int *) realloc(values, sizeof(int) * ++s);
        values[s - 1] = input[i];
      } else if (input[i] < 0xc0) {                                             // this is a continuation (80-bf)
        if (cont == contNeed) {                                                 // no need for continuation, use U+fffd
          values = (int *) realloc(values, sizeof(int) * ++s);
          values[s - 1] = 0xfffd;
        }
        cont = cont + 1;
        values[s - 1] = values[s - 1] | ((input[i] & 0x3f) << ((contNeed - cont) * 6));
        if (cont == contNeed) cont = contNeed = 0;
      } else if (input[i] < 0xc2) {                                             // invalid byte, use U+fffd (c0-c1)
        values = (int *) realloc(values, sizeof(int) * ++s);
        values[s - 1] = 0xfffd;
      } else if (input[i] < 0xe0) {                                             // start of 2-byte sequence (c2-df)
        contNeed = 1;
        values = (int *) realloc(values, sizeof(int) * ++s);
        values[s - 1] = (input[i] & 0x1f) << 6;
      } else if (input[i] < 0xf0) {                                             // start of 3-byte sequence (e0-ef)
        contNeed = 2;
        values = (int *) realloc(values, sizeof(int) * ++s);
        values[s - 1] = (input[i] & 0x0f) << 12;
      } else if (input[i] < 0xf5) {                                             // start of 4-byte sequence (f0-f4)
        contNeed = 3;
        values = (int *) realloc(values, sizeof(int) * ++s);
        values[s - 1] = (input[i] & 0x07) << 18;
      } else {                                                                  // restricted or invalid (f5-ff)
        values = (int *) realloc(values, sizeof(int) * ++s);
        values[s - 1] = 0xfffd;
      }
    len = s;
  }
  ustring operator=(const char * input) {
    ustring result(input);
    return result;
  }
  ustring operator+(ustring input) {
    ustring result;
    result.len = len + input.len;
    result.values = (int *) malloc(sizeof(int) * result.len);
    for (long i = 0; i < len; i++)
      result.values[i] = values[i];
    for (long i = 0; i < input.len; i++)
      result.values[i + len] = input.values[i];
    return result;
  }
  ustring operator[](long index) {
    ustring result;
    result.len = 1;
    result.values = (int *) malloc(sizeof(int));
    result.values[0] = values[index];
    return result;
  }
  char * encode() {
    char * r = (char *) malloc(0);
    long s = 0;
    for (long i = 0; i < len; i++) {
      if (values[i] < 0x80)
        r = (char *) realloc(r, s + 1),
        r[s + 0] = char(values[i]),
        s += 1;
      else if (values[i] < 0x800)
        r = (char *) realloc(r, s + 2),
        r[s + 0] = char(values[i] >> 6 | 0x60),
        r[s + 1] = char(values[i] & 0x3f | 0x80),
        s += 2;
      else if (values[i] < 0x10000)
        r = (char *) realloc(r, s + 3),
        r[s + 0] = char(values[i] >> 12 | 0xe0),
        r[s + 1] = char(values[i] >> 6 & 0x3f | 0x80),
        r[s + 2] = char(values[i] & 0x3f | 0x80),
        s += 3;
      else
        r = (char *) realloc(r, s + 4),
        r[s + 0] = char(values[i] >> 18 | 0xf0),
        r[s + 1] = char(values[i] >> 12 & 0x3f | 0x80),
        r[s + 2] = char(values[i] >> 6 & 0x3f | 0x80),
        r[s + 3] = char(values[i] & 0x3f | 0x80),
        s += 4;
    }
    return r;
  }
};
A: 

Both assignment operators are broken, e.g., this:

  ustring operator=(const char * input) {
    ustring result(input);
    return result;
  }

Does nothing to the target object. It just creates a local temporary and returns that. Write them like this instead:

  ustring& operator=(ustring input) {
    swap(input);
    return *this;
  }

  ustring& operator=(const char * input) {
    swap(ustring(input));
    return *this;
  }

  void swap(ustring& s) {
    int* tv = values; values = s.values; s.values = tv;
    long tl = len; len = s.len; s.len = tl;
  }
Marcelo Cantos
Thanks very much for the advice!
Delan Azabani
Your swap seems quite long winded. Is there anything wrong with `std::swap(values, s.values); std::swap(len, s.len);` which I would find significantly more readable?
Charles Bailey
+3  A: 

operator= should modify *this. The returned value (which you better make a reference) is only used in chaining situations:

a = b = c;
(a = b).foo();
//etc.
UncleBens
Thank you very much. I understand the difference clearly now.
Delan Azabani