diff --git a/analyzeme/src/stringtable.rs b/analyzeme/src/stringtable.rs index 8dae55c..86fc99d 100644 --- a/analyzeme/src/stringtable.rs +++ b/analyzeme/src/stringtable.rs @@ -12,13 +12,9 @@ use std::borrow::Cow; use std::error::Error; use memchr::memchr; -// See module-level documentation for more information on the encoding. -const UTF8_CONTINUATION_MASK: u8 = 0b1100_0000; -const UTF8_CONTINUATION_BYTE: u8 = 0b1000_0000; - fn deserialize_index_entry(bytes: &[u8]) -> (StringId, Addr) { ( - StringId::reserved(LittleEndian::read_u32(&bytes[0..4])), + StringId::new(LittleEndian::read_u32(&bytes[0..4])), Addr(LittleEndian::read_u32(&bytes[4..8])), ) } @@ -29,12 +25,29 @@ pub struct StringRef<'st> { table: &'st StringTable, } +// This is the text we emit when encountering a virtual string ID that cannot +// be resolved. +const UNKNOWN_STRING: &str = ""; + impl<'st> StringRef<'st> { + + /// Expands the StringRef into an actual string. This method will + /// avoid allocating a `String` if it can instead return a `&str` pointing + /// into the raw string table data. pub fn to_string(&self) -> Cow<'st, str> { - // Try to avoid the allocation, which we can do if this is a - // [value, 0xFF] entry. - let addr = self.table.index[&self.id]; + let addr = match self.get_addr() { + Ok(addr) => addr, + Err(_) => { + return Cow::from(UNKNOWN_STRING) + } + }; + + // Try to avoid the allocation, which we can do if this is + // + // - a string with a single value component (`[value, 0xFF]`) or + // - a string with a single reference component (`[string_id, 0xFF]`) + let pos = addr.as_usize(); let slice_to_search = &self.table.string_data[pos..]; @@ -43,12 +56,27 @@ impl<'st> StringRef<'st> { // is super fast. 
let terminator_pos = memchr(TERMINATOR, slice_to_search).unwrap(); + // Check if this is a string containing a single StringId component + let first_byte = self.table.string_data[pos]; + const STRING_ID_SIZE: usize = std::mem::size_of::<StringId>(); + if terminator_pos == STRING_ID_SIZE && is_utf8_continuation_byte(first_byte) { + let id = decode_string_id_from_data(&self.table.string_data[pos..pos+STRING_ID_SIZE]); + return StringRef { + id, + table: self.table, + }.to_string(); + } + // Decode the bytes until the terminator. If there is a string id in // between somewhere this will fail, and we fall back to the allocating // path. if let Ok(s) = std::str::from_utf8(&slice_to_search[..terminator_pos]) { Cow::from(s) } else { + // This is the slow path where we actually allocate a `String` on + // the heap and expand into that. If you suspect that there is a + // bug in the fast path above, you can easily check if always taking + // the slow path fixes the issue. let mut output = String::new(); self.write_to_string(&mut output); Cow::from(output) @@ -56,7 +84,15 @@ impl<'st> StringRef<'st> { } pub fn write_to_string(&self, output: &mut String) { - let addr = self.table.index[&self.id]; + + let addr = match self.get_addr() { + Ok(addr) => addr, + Err(_) => { + output.push_str(UNKNOWN_STRING); + return + } + }; + let mut pos = addr.as_usize(); loop { @@ -64,15 +100,9 @@ impl<'st> StringRef<'st> { if byte == TERMINATOR { return; - } else if (byte & UTF8_CONTINUATION_MASK) == UTF8_CONTINUATION_BYTE { - // This is a string-id - let id = BigEndian::read_u32(&self.table.string_data[pos..pos + 4]); - - // Mask off the `0b10` prefix - let id = id & STRING_ID_MASK; - + } else if is_utf8_continuation_byte(byte) { let string_ref = StringRef { - id: StringId::reserved(id), + id: decode_string_id_from_data(&self.table.string_data[pos..pos + 4]), table: self.table, }; @@ -87,6 +117,32 @@ impl<'st> StringRef<'st> { } } } + + fn get_addr(&self) -> Result<Addr, ()> { + if self.id.is_virtual() { + 
match self.table.index.get(&self.id) { + Some(&addr) => Ok(addr), + None => Err(()), + } + } else { + Ok(self.id.to_addr()) + } + } +} + +fn is_utf8_continuation_byte(byte: u8) -> bool { + // See module-level documentation for more information on the encoding. + const UTF8_CONTINUATION_MASK: u8 = 0b1100_0000; + const UTF8_CONTINUATION_BYTE: u8 = 0b1000_0000; + (byte & UTF8_CONTINUATION_MASK) == UTF8_CONTINUATION_BYTE +} + +// String IDs in the table data are encoded in big endian format, while string +// IDs in the index are encoded in little endian format. Don't mix the two up. +fn decode_string_id_from_data(bytes: &[u8]) -> StringId { + let id = BigEndian::read_u32(&bytes[0..4]); + // Mask off the `0b10` prefix + StringId::new(id & STRING_ID_MASK) } // Tries to decode a UTF-8 codepoint starting at the beginning of `bytes`. @@ -181,7 +237,7 @@ impl StringTable { } pub fn get_metadata<'a>(&'a self) -> StringRef<'a> { - let id = StringId::reserved(METADATA_STRING_ID); + let id = StringId::new(METADATA_STRING_ID); self.get(id) } } diff --git a/analyzeme/src/testing_common.rs b/analyzeme/src/testing_common.rs index 23a2aae..976f7a3 100644 --- a/analyzeme/src/testing_common.rs +++ b/analyzeme/src/testing_common.rs @@ -26,14 +26,14 @@ fn generate_profiling_data( ) -> Vec> { let profiler = Arc::new(Profiler::::new(Path::new(filestem)).unwrap()); - let event_id_reserved = StringId::reserved(42); + let event_id_virtual = StringId::new_virtual(42); let event_ids = vec![ ( profiler.alloc_string("Generic"), profiler.alloc_string("SomeGenericActivity"), ), - (profiler.alloc_string("Query"), event_id_reserved), + (profiler.alloc_string("Query"), event_id_virtual), ]; // This and event_ids have to match! 
@@ -73,7 +73,10 @@ fn generate_profiling_data( // An example of allocating the string contents of an event id that has // already been used - profiler.alloc_string_with_reserved_id(event_id_reserved, "SomeQuery"); + profiler.map_virtual_to_concrete_string( + event_id_virtual, + profiler.alloc_string("SomeQuery") + ); expected_events } diff --git a/measureme/src/file_header.rs b/measureme/src/file_header.rs index e32ef59..835c623 100644 --- a/measureme/src/file_header.rs +++ b/measureme/src/file_header.rs @@ -6,7 +6,7 @@ use crate::serialization::SerializationSink; use byteorder::{ByteOrder, LittleEndian}; use std::error::Error; -pub const CURRENT_FILE_FORMAT_VERSION: u32 = 3; +pub const CURRENT_FILE_FORMAT_VERSION: u32 = 4; pub const FILE_MAGIC_EVENT_STREAM: &[u8; 4] = b"MMES"; pub const FILE_MAGIC_STRINGTABLE_DATA: &[u8; 4] = b"MMSD"; pub const FILE_MAGIC_STRINGTABLE_INDEX: &[u8; 4] = b"MMSI"; diff --git a/measureme/src/profiler.rs b/measureme/src/profiler.rs index 58a6b7d..cf96257 100644 --- a/measureme/src/profiler.rs +++ b/measureme/src/profiler.rs @@ -68,12 +68,21 @@ impl Profiler { } #[inline(always)] - pub fn alloc_string_with_reserved_id( + pub fn map_virtual_to_concrete_string(&self, virtual_id: StringId, concrete_id: StringId) { + self.string_table + .map_virtual_to_concrete_string(virtual_id, concrete_id); + } + + #[inline(always)] + pub fn bulk_map_virtual_to_single_concrete_string( &self, - id: StringId, - s: &STR, - ) -> StringId { - self.string_table.alloc_with_reserved_id(id, s) + virtual_ids: I, + concrete_id: StringId, + ) where + I: Iterator + ExactSizeIterator, + { + self.string_table + .bulk_map_virtual_to_single_concrete_string(virtual_ids, concrete_id); } #[inline(always)] @@ -92,6 +101,7 @@ impl Profiler { /// Creates a "start" event and returns a `TimingGuard` that will create /// the corresponding "end" event when it is dropped. 
+ #[inline] pub fn start_recording_interval_event<'a>( &'a self, event_kind: StringId, diff --git a/measureme/src/raw_event.rs b/measureme/src/raw_event.rs index db7c28d..c9e854a 100644 --- a/measureme/src/raw_event.rs +++ b/measureme/src/raw_event.rs @@ -116,7 +116,7 @@ impl RawEvent { { // We always emit data as little endian, which we have to do // manually on big endian targets. - use byteorder::{LittleEndian, ByteOrder}; + use byteorder::{ByteOrder, LittleEndian}; LittleEndian::write_u32(&mut bytes[0..], self.event_kind.as_u32()); LittleEndian::write_u32(&mut bytes[4..], self.event_id.as_u32()); @@ -146,7 +146,7 @@ impl RawEvent { #[cfg(target_endian = "big")] { - use byteorder::{LittleEndian, ByteOrder}; + use byteorder::{ByteOrder, LittleEndian}; RawEvent { event_kind: StringId::reserved(LittleEndian::read_u32(&bytes[0..])), event_id: StringId::reserved(LittleEndian::read_u32(&bytes[4..])), @@ -162,8 +162,8 @@ impl RawEvent { impl Default for RawEvent { fn default() -> Self { RawEvent { - event_kind: StringId::reserved(0), - event_id: StringId::reserved(0), + event_kind: StringId::INVALID, + event_id: StringId::INVALID, thread_id: 0, start_time_lower: 0, end_time_lower: 0, @@ -184,22 +184,19 @@ mod tests { #[test] fn is_instant() { - assert!( - RawEvent::new_instant(StringId::reserved(0), StringId::reserved(0), 987, 0,) - .is_instant() - ); + assert!(RawEvent::new_instant(StringId::INVALID, StringId::INVALID, 987, 0,).is_instant()); assert!(RawEvent::new_instant( - StringId::reserved(0), - StringId::reserved(0), + StringId::INVALID, + StringId::INVALID, 987, MAX_INSTANT_TIMESTAMP, ) .is_instant()); assert!(!RawEvent::new_interval( - StringId::reserved(0), - StringId::reserved(0), + StringId::INVALID, + StringId::INVALID, 987, 0, MAX_INTERVAL_TIMESTAMP, @@ -211,8 +208,8 @@ mod tests { #[should_panic] fn invalid_instant_timestamp() { let _ = RawEvent::new_instant( - StringId::reserved(0), - StringId::reserved(0), + StringId::INVALID, + StringId::INVALID, 123, 
// timestamp too large MAX_INSTANT_TIMESTAMP + 1, @@ -223,8 +220,8 @@ mod tests { #[should_panic] fn invalid_start_timestamp() { let _ = RawEvent::new_interval( - StringId::reserved(0), - StringId::reserved(0), + StringId::INVALID, + StringId::INVALID, 123, // start timestamp too large MAX_INTERVAL_TIMESTAMP + 1, @@ -236,8 +233,8 @@ mod tests { #[should_panic] fn invalid_end_timestamp() { let _ = RawEvent::new_interval( - StringId::reserved(0), - StringId::reserved(0), + StringId::INVALID, + StringId::INVALID, 123, 0, // end timestamp too large @@ -249,8 +246,8 @@ mod tests { #[should_panic] fn invalid_end_timestamp2() { let _ = RawEvent::new_interval( - StringId::reserved(0), - StringId::reserved(0), + StringId::INVALID, + StringId::INVALID, 123, 0, INSTANT_TIMESTAMP_MARKER, @@ -261,8 +258,8 @@ mod tests { #[should_panic] fn start_greater_than_end_timestamp() { let _ = RawEvent::new_interval( - StringId::reserved(0), - StringId::reserved(0), + StringId::INVALID, + StringId::INVALID, 123, // start timestamp greater than end timestamp 1, @@ -273,15 +270,15 @@ mod tests { #[test] fn start_equal_to_end_timestamp() { // This is allowed, make sure we don't panic - let _ = RawEvent::new_interval(StringId::reserved(0), StringId::reserved(0), 123, 1, 1); + let _ = RawEvent::new_interval(StringId::INVALID, StringId::INVALID, 123, 1, 1); } #[test] fn interval_timestamp_decoding() { // Check the upper limits let e = RawEvent::new_interval( - StringId::reserved(0), - StringId::reserved(0), + StringId::INVALID, + StringId::INVALID, 1234, MAX_INTERVAL_TIMESTAMP, MAX_INTERVAL_TIMESTAMP, @@ -291,15 +288,15 @@ mod tests { assert_eq!(e.end_nanos(), MAX_INTERVAL_TIMESTAMP); // Check the lower limits - let e = RawEvent::new_interval(StringId::reserved(0), StringId::reserved(0), 1234, 0, 0); + let e = RawEvent::new_interval(StringId::INVALID, StringId::INVALID, 1234, 0, 0); assert_eq!(e.start_nanos(), 0); assert_eq!(e.end_nanos(), 0); // Check that end does not bleed into start let e = 
RawEvent::new_interval( - StringId::reserved(0), - StringId::reserved(0), + StringId::INVALID, + StringId::INVALID, 1234, 0, MAX_INTERVAL_TIMESTAMP, @@ -310,8 +307,8 @@ mod tests { // Test some random values let e = RawEvent::new_interval( - StringId::reserved(0), - StringId::reserved(0), + StringId::INVALID, + StringId::INVALID, 1234, 0x1234567890, 0x1234567890A, @@ -324,15 +321,14 @@ mod tests { #[test] fn instant_timestamp_decoding() { assert_eq!( - RawEvent::new_instant(StringId::reserved(0), StringId::reserved(0), 987, 0,) - .start_nanos(), + RawEvent::new_instant(StringId::INVALID, StringId::INVALID, 987, 0,).start_nanos(), 0 ); assert_eq!( RawEvent::new_instant( - StringId::reserved(0), - StringId::reserved(0), + StringId::INVALID, + StringId::INVALID, 987, MAX_INSTANT_TIMESTAMP, ) diff --git a/measureme/src/stringtable.rs b/measureme/src/stringtable.rs index e4efab8..fa513e4 100644 --- a/measureme/src/stringtable.rs +++ b/measureme/src/stringtable.rs @@ -42,9 +42,11 @@ //! Each string in the table is referred to via a `StringId`. `StringId`s may //! be generated in two ways: //! -//! 1. Calling `StringTable::alloc()` which returns the `StringId` for the -//! allocated string. -//! 2. Calling `StringTable::alloc_with_reserved_id()` and `StringId::reserved()`. +//! 1. Calling `StringTableBuilder::alloc()` which returns the `StringId` for +//! the allocated string. +//! 2. Calling `StringId::new_virtual()` to create a "virtual" `StringId` that +//! later can be mapped to an actual string via +//! `StringTableBuilder::map_virtual_to_concrete_string()`. //! //! String IDs allow you to deduplicate strings by allocating a string //! once and then referring to it by id over and over. This is a useful trick @@ -53,10 +55,10 @@ //! //! `StringId`s are partitioned according to type: //! -//! > [0 .. MAX_PRE_RESERVED_STRING_ID, METADATA_STRING_ID, .. ] +//! > [0 .. MAX_VIRTUAL_STRING_ID, METADATA_STRING_ID, .. ] //! -//! 
From `0` to `MAX_PRE_RESERVED_STRING_ID` are the allowed values for reserved strings. -//! After `MAX_PRE_RESERVED_STRING_ID`, there is one string id (`METADATA_STRING_ID`) which is used +//! From `0` to `MAX_VIRTUAL_STRING_ID` are the allowed values for virtual strings. +//! After `MAX_VIRTUAL_STRING_ID`, there is one string id (`METADATA_STRING_ID`) which is used //! internally by `measureme` to record additional metadata about the profiling session. //! After `METADATA_STRING_ID` are all other `StringId` values. //! @@ -66,25 +68,53 @@ use crate::file_header::{ }; use crate::serialization::{Addr, SerializationSink}; use byteorder::{BigEndian, ByteOrder, LittleEndian}; -use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::Arc; -/// A `StringId` is used to identify a string in the `StringTable`. +/// A `StringId` is used to identify a string in the `StringTable`. It is +/// either a regular `StringId`, meaning that it contains the absolute address +/// of a string within the string table data. Or it is "virtual", which means +/// that the address it points to is resolved via the string table index data, +/// that maps virtual `StringId`s to addresses. 
#[derive(Clone, Copy, Eq, PartialEq, Debug, Hash)] #[repr(C)] pub struct StringId(u32); impl StringId { + pub const INVALID: StringId = StringId(INVALID_STRING_ID); + #[inline] - pub fn reserved(id: u32) -> StringId { - assert!(id == id & STRING_ID_MASK); + pub fn new(id: u32) -> StringId { + assert!(id <= MAX_STRING_ID); StringId(id) } + #[inline] + pub fn new_virtual(id: u32) -> StringId { + assert!(id <= MAX_USER_VIRTUAL_STRING_ID); + StringId(id) + } + + #[inline] + pub fn is_virtual(self) -> bool { + self.0 <= METADATA_STRING_ID + } + #[inline] pub fn as_u32(self) -> u32 { self.0 } + + #[inline] + pub fn from_addr(addr: Addr) -> StringId { + let id = addr.0 + FIRST_REGULAR_STRING_ID; + StringId::new(id) + } + + #[inline] + pub fn to_addr(self) -> Addr { + assert!(self.0 >= FIRST_REGULAR_STRING_ID); + Addr(self.0 - FIRST_REGULAR_STRING_ID) + } } // See module-level documentation for more information on the encoding. @@ -94,17 +124,21 @@ pub const TERMINATOR: u8 = 0xFF; pub const MAX_STRING_ID: u32 = 0x3FFF_FFFF; pub const STRING_ID_MASK: u32 = 0x3FFF_FFFF; -/// The maximum id value a prereserved string may be. -const MAX_PRE_RESERVED_STRING_ID: u32 = MAX_STRING_ID / 2; +/// The maximum id value a virtual string may be. +const MAX_USER_VIRTUAL_STRING_ID: u32 = 100_000_000; /// The id of the profile metadata string entry. -pub const METADATA_STRING_ID: u32 = MAX_PRE_RESERVED_STRING_ID + 1; +pub const METADATA_STRING_ID: u32 = MAX_USER_VIRTUAL_STRING_ID + 1; + +/// Some random string ID that we make sure cannot be generated or assigned to. 
+const INVALID_STRING_ID: u32 = METADATA_STRING_ID + 1; + +pub const FIRST_REGULAR_STRING_ID: u32 = INVALID_STRING_ID + 1; /// Write-only version of the string table pub struct StringTableBuilder { data_sink: Arc, index_sink: Arc, - id_counter: AtomicU32, // initialized to METADATA_STRING_ID + 1 } /// Anything that implements `SerializableString` can be written to a @@ -233,41 +267,63 @@ impl StringTableBuilder { StringTableBuilder { data_sink, index_sink, - id_counter: AtomicU32::new(METADATA_STRING_ID + 1), } } - pub fn alloc_with_reserved_id( - &self, - id: StringId, - s: &STR, - ) -> StringId { - assert!(id.0 <= MAX_PRE_RESERVED_STRING_ID); - self.alloc_unchecked(id, s); - id + /// Creates a mapping so that `virtual_id` will resolve to the contents of + /// `concrete_id` when reading the string table. + pub fn map_virtual_to_concrete_string(&self, virtual_id: StringId, concrete_id: StringId) { + // This assertion does not use `is_virtual` on purpose because that + // would also allow to overwrite `METADATA_STRING_ID`. + assert!(virtual_id.0 <= MAX_USER_VIRTUAL_STRING_ID); + serialize_index_entry(&*self.index_sink, virtual_id, concrete_id.to_addr()); } - pub(crate) fn alloc_metadata(&self, s: &STR) -> StringId { - let id = StringId(METADATA_STRING_ID); - self.alloc_unchecked(id, s); - id + pub fn bulk_map_virtual_to_single_concrete_string( + &self, + virtual_ids: I, + concrete_id: StringId, + ) where + I: Iterator + ExactSizeIterator, + { + // TODO: Index data encoding could have special bulk mode that assigns + // multiple StringIds to the same addr, so we don't have to repeat + // the `concrete_id` over and over. 
+ + type MappingEntry = [u32; 2]; + assert!(std::mem::size_of::<MappingEntry>() == 8); + + let to_addr_le = concrete_id.to_addr().0.to_le(); + + let serialized: Vec<MappingEntry> = virtual_ids + .map(|from| { + let id = from.0; + assert!(id <= MAX_USER_VIRTUAL_STRING_ID); + [id.to_le(), to_addr_le] + }) + .collect(); + + let num_bytes = serialized.len() * std::mem::size_of::<MappingEntry>(); + let byte_ptr = serialized.as_ptr() as *const u8; + + let bytes = unsafe { std::slice::from_raw_parts(byte_ptr, num_bytes) }; + + self.index_sink.write_bytes_atomic(bytes); } - pub fn alloc(&self, s: &STR) -> StringId { - let id = StringId(self.id_counter.fetch_add(1, Ordering::SeqCst)); - assert!(id.0 > METADATA_STRING_ID); - assert!(id.0 <= MAX_STRING_ID); - self.alloc_unchecked(id, s); - id + pub(crate) fn alloc_metadata(&self, s: &STR) { + let concrete_id = self.alloc(s); + let virtual_id = StringId(METADATA_STRING_ID); + assert!(virtual_id.is_virtual()); + serialize_index_entry(&*self.index_sink, virtual_id, concrete_id.to_addr()); } - #[inline] - fn alloc_unchecked(&self, id: StringId, s: &STR) { + pub fn alloc(&self, s: &STR) -> StringId { let size_in_bytes = s.serialized_size(); let addr = self.data_sink.write_atomic(size_in_bytes, |mem| { s.serialize(mem); }); - serialize_index_entry(&*self.index_sink, id, addr); + StringId::from_addr(addr) } }