Skip to content

Commit

Permalink
StringTable: Replace the concept of reserved StringIds with the conce…
Browse files Browse the repository at this point in the history
…pt of virtual StringIds.

With this commit only "virtual" StringIds get an entry in the index table and regular StringIds
store an actual address instead of an index table key. That makes the index data a lot smaller
and removes the need to do a table lookup for regular StringIds.
  • Loading branch information
michaelwoerister committed Dec 10, 2019
1 parent e1127aa commit 47ca752
Show file tree
Hide file tree
Showing 6 changed files with 218 additions and 97 deletions.
92 changes: 74 additions & 18 deletions analyzeme/src/stringtable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,9 @@ use std::borrow::Cow;
use std::error::Error;
use memchr::memchr;

// See module-level documentation for more information on the encoding.
const UTF8_CONTINUATION_MASK: u8 = 0b1100_0000;
const UTF8_CONTINUATION_BYTE: u8 = 0b1000_0000;

// Decodes one 8-byte entry of the string table index: a little-endian
// StringId followed by the little-endian address of the string's data.
// (Diff residue fixed here: the pre-commit `StringId::reserved(...)` element
// was left in place next to the new `StringId::new(...)` one.)
fn deserialize_index_entry(bytes: &[u8]) -> (StringId, Addr) {
    (
        StringId::new(LittleEndian::read_u32(&bytes[0..4])),
        Addr(LittleEndian::read_u32(&bytes[4..8])),
    )
}
Expand All @@ -29,12 +25,29 @@ pub struct StringRef<'st> {
table: &'st StringTable,
}

// This is the text we emit when encountering a virtual string ID that cannot
// be resolved, i.e. one that has no entry in the string table's index
// (see `StringRef::get_addr`, which reports this case as `Err`).
const UNKNOWN_STRING: &str = "<unknown>";

impl<'st> StringRef<'st> {

/// Expands the StringRef into an actual string. This method will
/// avoid allocating a `String` if it can instead return a `&str` pointing
/// into the raw string table data.
pub fn to_string(&self) -> Cow<'st, str> {

// Try to avoid the allocation, which we can do if this is a
// [value, 0xFF] entry.
let addr = self.table.index[&self.id];
let addr = match self.get_addr() {
Ok(addr) => addr,
Err(_) => {
return Cow::from(UNKNOWN_STRING)
}
};

// Try to avoid the allocation, which we can do if this is
//
// - a string with a single value component (`[value, 0xFF]`) or
// - a string with a single reference component (`[string_id, 0xFF]`)

let pos = addr.as_usize();
let slice_to_search = &self.table.string_data[pos..];

Expand All @@ -43,36 +56,53 @@ impl<'st> StringRef<'st> {
// is super fast.
let terminator_pos = memchr(TERMINATOR, slice_to_search).unwrap();

// Check if this is a string containing a single StringId component
let first_byte = self.table.string_data[pos];
const STRING_ID_SIZE: usize = std::mem::size_of::<StringId>();
if terminator_pos == pos + STRING_ID_SIZE && is_utf8_continuation_byte(first_byte) {
let id = decode_string_id_from_data(&self.table.string_data[pos..pos+STRING_ID_SIZE]);
return StringRef {
id,
table: self.table,
}.to_string();
}

// Decode the bytes until the terminator. If there is a string id in
// between somewhere this will fail, and we fall back to the allocating
// path.
if let Ok(s) = std::str::from_utf8(&slice_to_search[..terminator_pos]) {
Cow::from(s)
} else {
// This is the slow path where we actually allocate a `String` on
// the heap and expand into that. If you suspect that there is a
// bug in the fast path above, you can easily check if always taking
// the slow path fixes the issue.
let mut output = String::new();
self.write_to_string(&mut output);
Cow::from(output)
}
}

pub fn write_to_string(&self, output: &mut String) {
let addr = self.table.index[&self.id];

let addr = match self.get_addr() {
Ok(addr) => addr,
Err(_) => {
output.push_str(UNKNOWN_STRING);
return
}
};

let mut pos = addr.as_usize();

loop {
let byte = self.table.string_data[pos];

if byte == TERMINATOR {
return;
} else if (byte & UTF8_CONTINUATION_MASK) == UTF8_CONTINUATION_BYTE {
// This is a string-id
let id = BigEndian::read_u32(&self.table.string_data[pos..pos + 4]);

// Mask off the `0b10` prefix
let id = id & STRING_ID_MASK;

} else if is_utf8_continuation_byte(byte) {
let string_ref = StringRef {
id: StringId::reserved(id),
id: decode_string_id_from_data(&self.table.string_data[pos..pos + 4]),
table: self.table,
};

Expand All @@ -87,6 +117,32 @@ impl<'st> StringRef<'st> {
}
}
}

// Resolves this id to the address of its string data. Concrete ids encode
// their address directly; virtual ids require a lookup in the index table,
// which may have no entry for them — that case is reported as `Err(())`.
fn get_addr(&self) -> Result<Addr, ()> {
    if !self.id.is_virtual() {
        return Ok(self.id.to_addr());
    }
    self.table.index.get(&self.id).copied().ok_or(())
}
}

// Returns true if `byte` is a UTF-8 continuation byte, i.e. has the bit
// pattern `0b10xx_xxxx`. See module-level documentation for more information
// on how the string table encoding exploits this.
fn is_utf8_continuation_byte(byte: u8) -> bool {
    const HIGH_BITS: u8 = 0b1100_0000;
    const CONTINUATION_TAG: u8 = 0b1000_0000;
    (byte & HIGH_BITS) == CONTINUATION_TAG
}

// Decodes a StringId embedded in the string table *data*. Note the asymmetry:
// ids in the data section are big endian, while ids in the index are little
// endian — don't mix the two up.
fn decode_string_id_from_data(bytes: &[u8]) -> StringId {
    let raw_id = BigEndian::read_u32(&bytes[0..4]);
    // Strip the `0b10` tag bits; what remains is the actual id value.
    StringId::new(raw_id & STRING_ID_MASK)
}

// Tries to decode a UTF-8 codepoint starting at the beginning of `bytes`.
Expand Down Expand Up @@ -181,7 +237,7 @@ impl StringTable {
}

/// Returns a `StringRef` for the table's metadata string, which lives at
/// the well-known id `METADATA_STRING_ID`.
/// (Diff residue fixed here: the removed `StringId::reserved(...)` line was
/// still present alongside the new `StringId::new(...)` one.)
pub fn get_metadata<'a>(&'a self) -> StringRef<'a> {
    let id = StringId::new(METADATA_STRING_ID);
    self.get(id)
}
}
Expand Down
9 changes: 6 additions & 3 deletions analyzeme/src/testing_common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,14 @@ fn generate_profiling_data<S: SerializationSink>(
) -> Vec<Event<'static>> {
let profiler = Arc::new(Profiler::<S>::new(Path::new(filestem)).unwrap());

let event_id_reserved = StringId::reserved(42);
let event_id_virtual = StringId::new_virtual(42);

let event_ids = vec![
(
profiler.alloc_string("Generic"),
profiler.alloc_string("SomeGenericActivity"),
),
(profiler.alloc_string("Query"), event_id_reserved),
(profiler.alloc_string("Query"), event_id_virtual),
];

// This and event_ids have to match!
Expand Down Expand Up @@ -73,7 +73,10 @@ fn generate_profiling_data<S: SerializationSink>(

// An example of allocating the string contents of an event id that has
// already been used
profiler.alloc_string_with_reserved_id(event_id_reserved, "SomeQuery");
profiler.map_virtual_to_concrete_string(
event_id_virtual,
profiler.alloc_string("SomeQuery")
);

expected_events
}
Expand Down
2 changes: 1 addition & 1 deletion measureme/src/file_header.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use crate::serialization::SerializationSink;
use byteorder::{ByteOrder, LittleEndian};
use std::error::Error;

// Bumped 3 -> 4 by the virtual-StringId change; readers must not accept
// files written under the old reserved-StringId scheme.
// (Diff residue fixed here: both the old `= 3` and new `= 4` definitions
// were present, which is a duplicate-definition error.)
pub const CURRENT_FILE_FORMAT_VERSION: u32 = 4;
pub const FILE_MAGIC_EVENT_STREAM: &[u8; 4] = b"MMES";
pub const FILE_MAGIC_STRINGTABLE_DATA: &[u8; 4] = b"MMSD";
pub const FILE_MAGIC_STRINGTABLE_INDEX: &[u8; 4] = b"MMSI";
Expand Down
20 changes: 15 additions & 5 deletions measureme/src/profiler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,12 +68,21 @@ impl<S: SerializationSink> Profiler<S> {
}

#[inline(always)]
pub fn alloc_string_with_reserved_id<STR: SerializableString + ?Sized>(
pub fn map_virtual_to_concrete_string(&self, virtual_id: StringId, concrete_id: StringId) {
self.string_table
.map_virtual_to_concrete_string(virtual_id, concrete_id);
}

#[inline(always)]
pub fn bulk_map_virtual_to_single_concrete_string<I>(
&self,
id: StringId,
s: &STR,
) -> StringId {
self.string_table.alloc_with_reserved_id(id, s)
virtual_ids: I,
concrete_id: StringId,
) where
I: Iterator<Item = StringId> + ExactSizeIterator,
{
self.string_table
.bulk_map_virtual_to_single_concrete_string(virtual_ids, concrete_id);
}

#[inline(always)]
Expand All @@ -92,6 +101,7 @@ impl<S: SerializationSink> Profiler<S> {

/// Creates a "start" event and returns a `TimingGuard` that will create
/// the corresponding "end" event when it is dropped.
#[inline]
pub fn start_recording_interval_event<'a>(
&'a self,
event_kind: StringId,
Expand Down
Loading

0 comments on commit 47ca752

Please sign in to comment.