Skip to content

Commit

Permalink
Merge pull request #98 from michaelwoerister/virtual-instead-of-reser…
Browse files Browse the repository at this point in the history
…ved-string-ids

Virtual instead of reserved string ids
  • Loading branch information
wesleywiser authored Dec 10, 2019
2 parents e1127aa + 47ca752 commit 8d2d4fd
Show file tree
Hide file tree
Showing 6 changed files with 218 additions and 97 deletions.
92 changes: 74 additions & 18 deletions analyzeme/src/stringtable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,9 @@ use std::borrow::Cow;
use std::error::Error;
use memchr::memchr;

// See module-level documentation for more information on the encoding.
const UTF8_CONTINUATION_MASK: u8 = 0b1100_0000;
const UTF8_CONTINUATION_BYTE: u8 = 0b1000_0000;

/// Decodes a single index entry from `bytes`.
///
/// Bytes `0..4` are read as a little-endian `u32` and wrapped in a `StringId`;
/// bytes `4..8` are read as a little-endian `u32` and wrapped in an `Addr`.
/// The slice indexing panics if `bytes` holds fewer than 8 bytes.
fn deserialize_index_entry(bytes: &[u8]) -> (StringId, Addr) {
(
StringId::reserved(LittleEndian::read_u32(&bytes[0..4])),
StringId::new(LittleEndian::read_u32(&bytes[0..4])),
Addr(LittleEndian::read_u32(&bytes[4..8])),
)
}
Expand All @@ -29,12 +25,29 @@ pub struct StringRef<'st> {
table: &'st StringTable,
}

// This is the text we emit when encountering a virtual string ID that cannot
// be resolved.
const UNKNOWN_STRING: &str = "<unknown>";

impl<'st> StringRef<'st> {

/// Expands the StringRef into an actual string. This method will
/// avoid allocating a `String` if it can instead return a `&str` pointing
/// into the raw string table data.
pub fn to_string(&self) -> Cow<'st, str> {

// Try to avoid the allocation, which we can do if this is a
// [value, 0xFF] entry.
let addr = self.table.index[&self.id];
let addr = match self.get_addr() {
Ok(addr) => addr,
Err(_) => {
return Cow::from(UNKNOWN_STRING)
}
};

// Try to avoid the allocation, which we can do if this is
//
// - a string with a single value component (`[value, 0xFF]`) or
// - a string with a single reference component (`[string_id, 0xFF]`)

let pos = addr.as_usize();
let slice_to_search = &self.table.string_data[pos..];

Expand All @@ -43,36 +56,53 @@ impl<'st> StringRef<'st> {
// is super fast.
let terminator_pos = memchr(TERMINATOR, slice_to_search).unwrap();

// Check if this is a string containing a single StringId component
let first_byte = self.table.string_data[pos];
const STRING_ID_SIZE: usize = std::mem::size_of::<StringId>();
if terminator_pos == pos + STRING_ID_SIZE && is_utf8_continuation_byte(first_byte) {
let id = decode_string_id_from_data(&self.table.string_data[pos..pos+STRING_ID_SIZE]);
return StringRef {
id,
table: self.table,
}.to_string();
}

// Decode the bytes until the terminator. If there is a string id in
// between somewhere this will fail, and we fall back to the allocating
// path.
if let Ok(s) = std::str::from_utf8(&slice_to_search[..terminator_pos]) {
Cow::from(s)
} else {
// This is the slow path where we actually allocate a `String` on
// the heap and expand into that. If you suspect that there is a
// bug in the fast path above, you can easily check if always taking
// the slow path fixes the issue.
let mut output = String::new();
self.write_to_string(&mut output);
Cow::from(output)
}
}

pub fn write_to_string(&self, output: &mut String) {
let addr = self.table.index[&self.id];

let addr = match self.get_addr() {
Ok(addr) => addr,
Err(_) => {
output.push_str(UNKNOWN_STRING);
return
}
};

let mut pos = addr.as_usize();

loop {
let byte = self.table.string_data[pos];

if byte == TERMINATOR {
return;
} else if (byte & UTF8_CONTINUATION_MASK) == UTF8_CONTINUATION_BYTE {
// This is a string-id
let id = BigEndian::read_u32(&self.table.string_data[pos..pos + 4]);

// Mask off the `0b10` prefix
let id = id & STRING_ID_MASK;

} else if is_utf8_continuation_byte(byte) {
let string_ref = StringRef {
id: StringId::reserved(id),
id: decode_string_id_from_data(&self.table.string_data[pos..pos + 4]),
table: self.table,
};

Expand All @@ -87,6 +117,32 @@ impl<'st> StringRef<'st> {
}
}
}

/// Resolves this reference's id to an address in the string data.
///
/// A non-virtual id is converted to its address directly via `to_addr()`.
/// A virtual id is looked up in the table's index; `Err(())` is returned
/// when the index has no entry for it.
fn get_addr(&self) -> Result<Addr, ()> {
    // Guard clause: concrete ids never need the index.
    if !self.id.is_virtual() {
        return Ok(self.id.to_addr());
    }

    self.table.index.get(&self.id).copied().ok_or(())
}
}

/// Returns `true` when the two most significant bits of `byte` are `10`,
/// which is the bit pattern of a UTF-8 continuation byte.
fn is_utf8_continuation_byte(byte: u8) -> bool {
    // See module-level documentation for more information on the encoding.
    const TAG_SHIFT: u32 = 6;
    const CONTINUATION_TAG: u8 = 0b10;

    // Shifting right by six leaves only the top two bits to compare.
    (byte >> TAG_SHIFT) == CONTINUATION_TAG
}

// Careful with endianness: string IDs embedded in the table data are stored
// big endian, whereas string IDs in the index are stored little endian. Do
// not mix the two up.
//
// Reads the first four bytes of `bytes` as a big-endian `u32`, strips the
// `0b10` tag prefix with `STRING_ID_MASK`, and wraps the result in a
// `StringId`.
fn decode_string_id_from_data(bytes: &[u8]) -> StringId {
    let tagged_id = BigEndian::read_u32(&bytes[0..4]);
    let untagged_id = tagged_id & STRING_ID_MASK;
    StringId::new(untagged_id)
}

// Tries to decode a UTF-8 codepoint starting at the beginning of `bytes`.
Expand Down Expand Up @@ -181,7 +237,7 @@ impl StringTable {
}

/// Returns a `StringRef` for the metadata string, obtained by building a
/// `StringId` from `METADATA_STRING_ID` and passing it to `self.get`.
pub fn get_metadata<'a>(&'a self) -> StringRef<'a> {
let id = StringId::reserved(METADATA_STRING_ID);
let id = StringId::new(METADATA_STRING_ID);
self.get(id)
}
}
Expand Down
9 changes: 6 additions & 3 deletions analyzeme/src/testing_common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,14 @@ fn generate_profiling_data<S: SerializationSink>(
) -> Vec<Event<'static>> {
let profiler = Arc::new(Profiler::<S>::new(Path::new(filestem)).unwrap());

let event_id_reserved = StringId::reserved(42);
let event_id_virtual = StringId::new_virtual(42);

let event_ids = vec![
(
profiler.alloc_string("Generic"),
profiler.alloc_string("SomeGenericActivity"),
),
(profiler.alloc_string("Query"), event_id_reserved),
(profiler.alloc_string("Query"), event_id_virtual),
];

// This and event_ids have to match!
Expand Down Expand Up @@ -73,7 +73,10 @@ fn generate_profiling_data<S: SerializationSink>(

// An example of allocating the string contents of an event id that has
// already been used
profiler.alloc_string_with_reserved_id(event_id_reserved, "SomeQuery");
profiler.map_virtual_to_concrete_string(
event_id_virtual,
profiler.alloc_string("SomeQuery")
);

expected_events
}
Expand Down
2 changes: 1 addition & 1 deletion measureme/src/file_header.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use crate::serialization::SerializationSink;
use byteorder::{ByteOrder, LittleEndian};
use std::error::Error;

pub const CURRENT_FILE_FORMAT_VERSION: u32 = 3;
pub const CURRENT_FILE_FORMAT_VERSION: u32 = 4;
pub const FILE_MAGIC_EVENT_STREAM: &[u8; 4] = b"MMES";
pub const FILE_MAGIC_STRINGTABLE_DATA: &[u8; 4] = b"MMSD";
pub const FILE_MAGIC_STRINGTABLE_INDEX: &[u8; 4] = b"MMSI";
Expand Down
20 changes: 15 additions & 5 deletions measureme/src/profiler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,12 +68,21 @@ impl<S: SerializationSink> Profiler<S> {
}

#[inline(always)]
pub fn alloc_string_with_reserved_id<STR: SerializableString + ?Sized>(
/// Associates `virtual_id` with `concrete_id` by delegating to the
/// profiler's string table.
pub fn map_virtual_to_concrete_string(&self, virtual_id: StringId, concrete_id: StringId) {
    let string_table = &self.string_table;
    string_table.map_virtual_to_concrete_string(virtual_id, concrete_id);
}

#[inline(always)]
/// Delegates to the string table's
/// `bulk_map_virtual_to_single_concrete_string`, passing along `virtual_ids`
/// and `concrete_id` unchanged.
///
/// NOTE(review): presumably every id yielded by `virtual_ids` ends up mapped
/// to the same `concrete_id` -- confirm against the string table builder.
pub fn bulk_map_virtual_to_single_concrete_string<I>(
&self,
id: StringId,
s: &STR,
) -> StringId {
self.string_table.alloc_with_reserved_id(id, s)
virtual_ids: I,
concrete_id: StringId,
) where
I: Iterator<Item = StringId> + ExactSizeIterator,
{
self.string_table
.bulk_map_virtual_to_single_concrete_string(virtual_ids, concrete_id);
}

#[inline(always)]
Expand All @@ -92,6 +101,7 @@ impl<S: SerializationSink> Profiler<S> {

/// Creates a "start" event and returns a `TimingGuard` that will create
/// the corresponding "end" event when it is dropped.
#[inline]
pub fn start_recording_interval_event<'a>(
&'a self,
event_kind: StringId,
Expand Down
Loading

0 comments on commit 8d2d4fd

Please sign in to comment.