A Rust Crate that
Also Quacks Like
a Modern C++ Library

Henri Sivonen
Mozilla

encoding_rs

  • Conforms to the WHATWG Encoding Standard
  • Replaced a C++ library from 1999
  • To and from both UTF-8 and UTF-16

Basic Design

  • Encoding
    • Label resolution
    • Non-streaming conversion
  • Decoder & Encoder
    • Streaming
    • Encoding is the factory

Obtaining Instances

let encoding: &'static Encoding =
    Encoding::for_label( // by label
        byte_slice_from_protocol
    ).unwrap_or(
        WINDOWS_1252     // by named static
    );

let decoder: Decoder =
    encoding.new_decoder();

enum-Based Polymorphism

pub struct Decoder { // no vtable
   variant: VariantDecoder,
   // ...
}

pub enum VariantDecoder { // no extensibility
    SingleByte(SingleByteDecoder),
    Utf8(Utf8Decoder),
    Gb18030(Gb18030Decoder),
    // ...
}

Streaming

pub enum DecoderResult {
    InputEmpty,
    OutputFull,
    Malformed(u8, u8),
}

impl Decoder {
    pub fn decode_to_utf16_without_replacement(
        &mut self,
        src: &[u8],
        dst: &mut [u16],
        last: bool
    ) -> (DecoderResult, usize, usize)
}

Non-Streaming

impl Encoding {
    pub fn decode_without_bom_handling_and_without_replacement
    <'a>(
        &'static self,
        bytes: &'a [u8],
    ) -> Option<Cow<'a, str>>
}

C++ Core Guidelines

  • Edited by Bjarne Stroustrup & Herb Sutter
  • Guidelines for using modern C++ well
  • Includes a number of Rustic things

Guidelines Support Library

  • Core Guidelines require types not yet standardized
  • GSL provides those

gsl::not_null<T>

static Singletons in Rust

/// The `Encoding` value for UTF-8, stored as a `static`.
pub static UTF_8_INIT: Encoding = Encoding {
    name: "UTF-8",
    variant: VariantEncoding::Utf8,
};

/// A `&'static Encoding` pointing at `UTF_8_INIT`.
pub static UTF_8: &'static Encoding = &UTF_8_INIT;

static Singletons in FFI

/// Newtype around a raw `*const Encoding` so the pointer can be stored in a `static`.
pub struct ConstEncoding(*const Encoding);

// Raw pointers are not `Sync`, but the type of a `static` must be. The pointer
// wrapped below refers to the immutable static `UTF_8_INIT`.
unsafe impl Sync for ConstEncoding {}

/// Unmangled symbol whose value is the address of `UTF_8_INIT`.
#[no_mangle]
pub static UTF_8_ENCODING: ConstEncoding =
    ConstEncoding(&UTF_8_INIT);

static Singletons in C++

class Encoding;

extern gsl::not_null<const encoding_rs::Encoding*>
const UTF_8_ENCODING;

Encoding as a C++ class

// The default constructor, copy constructor, copy assignment and destructor
// are all deleted, so C++ code cannot create, copy or destroy an Encoding.
class Encoding final {
// ...
private:
    Encoding() = delete;
    Encoding(const Encoding&) = delete;
    Encoding& operator=(const Encoding&) = delete;
    ~Encoding() = delete;
};

std::unique_ptr<T>

let ptr: Box<Foo>
std::unique_ptr<Foo> ptr
Box::new(Foo::new(a, b, c))
make_unique<Foo>(a, b, c)
Box::into_raw(ptr)
ptr.release()
let ptr = Box::from_raw(raw_ptr);
std::unique_ptr<Foo> ptr(raw_ptr);

Concrete Example

impl Encoding {
    pub fn new_decoder(&'static self) -> Decoder {
        // ...
    }
}

FFI

/// Creates a decoder for `*encoding` on the heap and returns it as a raw pointer
/// that the caller owns.
///
/// # Safety
///
/// `encoding` is dereferenced, so it must point to a valid `Encoding`.
/// The returned pointer comes from `Box::into_raw`; release it with `decoder_free`.
#[no_mangle]
pub unsafe extern "C" fn encoding_new_decoder(
    encoding: *const Encoding) -> *mut Decoder
{
    let decoder = (*encoding).new_decoder();
    let boxed = Box::new(decoder);
    Box::into_raw(boxed)
}

/// Destroys a decoder by rebuilding the `Box` from its raw pointer and dropping it.
///
/// # Safety
///
/// `decoder` is passed to `Box::from_raw`, so it must be a pointer obtained
/// from `Box::into_raw` that has not already been freed.
#[no_mangle]
pub unsafe extern "C" fn decoder_free(decoder: *mut Decoder) {
    drop(Box::from_raw(decoder));
}

C++ Construction

class Encoding final {
public:
    // Calls encoding_new_decoder() with `this` as the encoding argument and
    // wraps the returned raw pointer in a std::unique_ptr, so the caller owns
    // the new Decoder.
    inline std::unique_ptr<Decoder> new_decoder() const
    {
        return std::unique_ptr<Decoder>(
            encoding_new_decoder(this));
    }
};

C++ Deletion

class Decoder final {
public:
    ~Decoder() {}
    // Class-specific deallocation function: `delete` on a Decoder* (including
    // the one performed by std::unique_ptr<Decoder>) hands the pointer to
    // decoder_free().
    static inline void operator delete(void* decoder)
    {
        decoder_free(reinterpret_cast<Decoder*>(decoder));
    }
private:
    // Not default-constructible, copyable or assignable from C++.
    Decoder() = delete;
    Decoder(const Decoder&) = delete;
    Decoder& operator=(const Decoder&) = delete;
};

How Is That Possible?

&self is Sugar

impl Foo {
    /// Returns the `val` field.
    pub fn get_val(&self) -> usize {
        self.val
    }
}

/// Asserts that the method-call form and the explicit-receiver form give the same result.
fn test(bar: Foo) {
    // `bar.get_val()` is sugar for `Foo::get_val(&bar)`: `&self` is just the
    // first parameter.
    assert_eq!(bar.get_val(), Foo::get_val(&bar));
}

&self and this

fn foo(&self, bar: usize) -> usize
size_t foo(size_t bar) const
fn foo(&mut self, bar: usize) -> usize
size_t foo(size_t bar)

No VTables!

  • No Rust trait objects
  • No C++ inheritance
    • nsISupports
    • QObject

std::optional<T>

return None;
return std::nullopt;
return Some(foo);
return foo;
is_some()
operator bool()
has_value()
unwrap()
value()
unwrap_or(bar)
value_or(bar)

Ergonomic Unsafety

  • operator*() is unchecked!
  • 😭

std::tuple
<Types...>

fn foo() -> (T, U, V)
std::tuple<T, U, V> foo()
return (a, b, c);
return {a, b, c};
let (a, b, c) = foo();
const auto [a, b, c] = foo();
let (mut a, mut b, mut c) = foo();
auto [a, b, c] = foo();

gsl::span<T>

Slice in C

src: &[u8]
const uint8_t* src, size_t src_len
dst: &mut [u8]
uint8_t* dst, size_t dst_len

Slice in C++

src: &[u8]
gsl::span<const uint8_t> src
dst: &mut [u8]
gsl::span<uint8_t> dst
&mut vec[..]
gsl::make_span(vec)
std::slice::from_raw_parts(ptr, len)
gsl::make_span(ptr, len)

Slice Access in C++

for item in slice {}
for (auto item : span) {}
slice[i]
span[i]
slice.len()
span.size()
slice.as_ptr()
span.data()

Subspan Problem

&slice[i..]
span.subspan(i)
&slice[..i]
span.subspan(0, i)
&slice[i..j]
span.subspan(i, j - i) 😭

mozilla::Span

&slice[i..]
span.From(i)
&slice[..i]
span.To(i)
&slice[i..j]
span.FromTo(i, j)

Read-Only String Slices

  • std::string_view
  • std::u16string_view
  • In C++17 (unlike gsl::span)
  • No guarantee of UTF validity

Zero-Length Slices

p must be non-null and aligned, even for zero-length slices, as is required for all references. However, for zero-length slices, p can be a bogus non-dereferencable pointer such as NonNull::dangling().

This slide has been edited from the version shown at RustFest in order to avoid spreading out-of-date information.

// Returns `ptr` unchanged when it is non-null. For a null `ptr`, returns a
// non-null placeholder whose address is alignof(T), so it is aligned for T.
// The placeholder must never be dereferenced.
template <class T>
static inline T* null_to_bogus(T* ptr)
{
    return ptr ? ptr : reinterpret_cast<T*>(alignof(T));
}

This slide has been edited from the version shown at RustFest in order to avoid spreading out-of-date information.

Concrete Example

impl Encoding {
    /// Checks whether `buffer` starts with a byte order mark.
    ///
    /// Returns the encoding the BOM denotes together with the BOM's length in
    /// bytes, or `None` when `buffer` does not start with a UTF-8, UTF-16LE
    /// or UTF-16BE BOM.
    pub fn for_bom(buffer: &[u8]) ->
        Option<(&'static Encoding, usize)>
    {
        match buffer {
            [0xEF, 0xBB, 0xBF, ..] => Some((UTF_8, 3)),
            [0xFF, 0xFE, ..] => Some((UTF_16LE, 2)),
            [0xFE, 0xFF, ..] => Some((UTF_16BE, 2)),
            _ => None,
        }
    }
}

FFI

/// C-callable wrapper around `Encoding::for_bom`.
///
/// On entry `*buffer_len` is the length of `buffer`. On return it holds the
/// length of the detected BOM, or 0 if none was found. The return value is the
/// detected encoding, or a null pointer if none was found.
///
/// # Safety
///
/// `buffer` and `*buffer_len` are passed to `slice::from_raw_parts`, so they
/// must describe a valid slice (`buffer` non-null and aligned even when the
/// length is zero). `buffer_len` must be valid for reads and writes.
#[no_mangle]
pub unsafe extern "C" fn encoding_for_bom(buffer: *const u8,
                                          buffer_len: *mut usize)
                                          -> *const Encoding
{
    let bytes = ::std::slice::from_raw_parts(buffer, *buffer_len);
    if let Some((encoding, bom_length)) = Encoding::for_bom(bytes) {
        *buffer_len = bom_length;
        encoding as *const Encoding
    } else {
        *buffer_len = 0;
        ::std::ptr::null()
    }
}

C++

class Encoding final {
public:
    // Calls encoding_for_bom() on `buffer`. Returns the detected encoding and
    // the BOM length as a tuple, or std::nullopt when encoding_for_bom()
    // returns a null pointer.
    static inline std::optional<
        std::tuple<gsl::not_null<const Encoding*>, size_t>>
    for_bom(gsl::span<const uint8_t> buffer)
    {
        // In/out argument: the buffer length goes in, the BOM length comes out.
        size_t len = buffer.size();
        // null_to_bogus() replaces a null data() pointer with a non-null
        // placeholder before the pointer crosses the FFI boundary.
        const Encoding* encoding =
            encoding_for_bom(null_to_bogus(buffer.data()), &len);
        if (encoding) {
            return std::make_tuple(
                gsl::not_null<const Encoding*>(encoding), len);
        }
        return std::nullopt;
    }
};

std::variant
<Types...>

Not Really enum

  • Variants not named
  • Duplicate types not practical
  • No language-level match

Manual Packing into u32

pub const INPUT_EMPTY: u32 = 0;

pub const OUTPUT_FULL: u32 = 0xFFFFFFFF;

fn decoder_result_to_u32(result: DecoderResult) -> u32 {
    match result {
        DecoderResult::InputEmpty => INPUT_EMPTY,
        DecoderResult::OutputFull => OUTPUT_FULL,
        DecoderResult::Malformed(bad, good) =>
            (good as u32) << 8) | (bad as u32),
    }
}

What about the Cows?

No Borrow Checker for C++

std::variant<std::string, std::string_view>

What Was Not Here?

Viewing Non-Primitive C++ Types from Rust

Rust Holding Pointers to C++ Objects

Rust Calling Out via FFI

  • Watch out for calls into C++ freeing your arguments

Recap

this is like &self

  • No C++ inheritance!
  • No Rust trait objects!

Can Declare C++-Visible Rust statics

gsl::not_null<T>

std::unique_ptr<T>

= delete;

static void
operator delete
(void*)

std::optional<T>

std::tuple
<Types...>

gsl::span<T>

std::string_view

std::variant
<Types...>

Structure of Cows
but not their Lifetimes

FIN

https://hsivonen.fi/rustfest2018/