UTF-8 – A C++ function to read code points from a UTF-8 stream


I’ve written a function that reads and returns one UTF-8 code point from an istream. I am wondering if the code is efficient or if there are some obvious problems with the implementation.

/// Reads one UTF-8 encoded code point from `in_stream` and returns its value.
///
/// @param in_stream  Byte stream to read the UTF-8 sequence from.
/// @return The decoded code point. If the stream is already exhausted when the
///         first byte is requested, the EOF value produced by `get()` is
///         returned, converted to `chr_t`.
/// @throws std::runtime_error if the first byte is not a valid UTF-8 lead
///         byte, if the stream ends in the middle of a sequence, if a trailing
///         byte is not a continuation byte (10xxxxxx), or if the sequence is
///         overlong, encodes a surrogate (U+D800..U+DFFF) or exceeds U+10FFFF.
chr_t utf32::get_utf32_char(std::istream &in_stream) {
    const int eof = std::istream::traits_type::eof();

    const int lead = in_stream.get();
    // End of input before any byte was read, or a single-byte (ASCII) code
    // point: hand the value back unchanged.
    if (lead == eof || lead < 0x80) {
        return static_cast<chr_t>(lead);
    }

    // Classify the lead byte: how many continuation bytes follow, which payload
    // bits it carries, and the smallest code point that legitimately needs a
    // sequence of this length (anything smaller is an overlong encoding).
    int trailing = 0;
    unsigned long min_value = 0;
    unsigned long code_point = 0;
    if ((lead & 0xe0) == 0xc0) {
        trailing = 1;
        min_value = 0x80UL;
        code_point = static_cast<unsigned long>(lead & 0x1f);
    } else if ((lead & 0xf0) == 0xe0) {
        trailing = 2;
        min_value = 0x800UL;
        code_point = static_cast<unsigned long>(lead & 0x0f);
    } else if ((lead & 0xf8) == 0xf0) {
        trailing = 3;
        min_value = 0x10000UL;
        code_point = static_cast<unsigned long>(lead & 0x07);
    } else {
        // Stray continuation byte (10xxxxxx) or a byte that never occurs in
        // UTF-8 (11111xxx).
        throw std::runtime_error("invalid utf8 character");
    }

    for (int i = 0; i < trailing; ++i) {
        const int next = in_stream.get();
        if (next == eof) {
            throw std::runtime_error("unexpected end of utf8 sequence");
        }
        // Every trailing byte must have the form 10xxxxxx; otherwise the
        // sequence is malformed and must not be decoded.
        if ((next & 0xc0) != 0x80) {
            throw std::runtime_error("invalid utf8 continuation byte");
        }
        code_point = (code_point << 6) | static_cast<unsigned long>(next & 0x3f);
    }

    // Reject non-shortest forms: accepting them lets the same code point be
    // spelled several ways, which defeats byte-level filtering of the input.
    if (code_point < min_value) {
        throw std::runtime_error("overlong utf8 encoding");
    }
    // Surrogates are reserved for UTF-16 and U+10FFFF is the last code point.
    if ((code_point >= 0xd800UL && code_point <= 0xdfffUL) || code_point > 0x10ffffUL) {
        throw std::runtime_error("invalid utf8 code point");
    }

    return static_cast<chr_t>(code_point);
}