Skip to content

Commit

Permalink
Streaming git read, ability to compute objects newly added in a commit
Browse files Browse the repository at this point in the history
  • Loading branch information
adamziel committed Dec 30, 2024
1 parent 63ed39a commit 7e833b6
Show file tree
Hide file tree
Showing 2 changed files with 233 additions and 52 deletions.
283 changes: 232 additions & 51 deletions packages/playground/data-liberation/src/git/WP_Git_Cached_Index.php
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@ class WP_Git_Cached_Index {

private $oid;
private $type;
private $length;
private $contents;
private $content_inflate_handle;
private $object_content_chunk;
private $called_next_object_chunk;
private $buffered_object_content;
private $parsed_commit;
private $parsed_tree;
private $last_error;
Expand All @@ -31,33 +33,142 @@ public function __construct(
}
}

/**
* @TODO: Streaming read. Don't load everything into memory.
*/
public function read_object($oid) {
// Reset the object state
$this->oid = null;
$this->type = null;
$this->length = null;
$this->contents = null;
$this->parsed_commit = null;
$this->parsed_tree = null;
$this->reset();

$object_path = $this->get_object_path($oid);
if(!$this->fs->is_file($object_path)) {
return false;
}

$contents = $this->fs->read_file($this->get_object_path($oid));
$contents = WP_Git_Pack_Processor::inflate($contents);
$type_length = strpos($contents, ' ');
$this->oid = $oid;
$this->type = substr($contents, 0, $type_length);
$this->length = substr($contents, $type_length + 1, strpos($contents, "\x00", $type_length) - $type_length - 1);
$this->contents = substr($contents, strpos($contents, "\x00", $type_length) + 1);
if($this->type === WP_Git_Pack_Processor::OBJECT_NAMES[WP_Git_Pack_Processor::OBJECT_TYPE_COMMIT]) {
$this->parsed_commit = WP_Git_Pack_Processor::parse_commit_message($this->contents);
} else if($this->type === WP_Git_Pack_Processor::OBJECT_NAMES[WP_Git_Pack_Processor::OBJECT_TYPE_TREE]) {
$this->parsed_tree = WP_Git_Pack_Processor::parse_tree_bytes($this->contents);
if(!$this->open_object_stream()) {
return false;
}

// Read the object header and initialize the internal state
// for the specific get_* methods below.
$header = false;
$content = '';
while($this->next_object_chunk()) {
$content .= $this->get_object_content_chunk();
$null_byte_position = strpos($content, "\x00");
if($null_byte_position === false) {
continue;
}
$header = substr($content, 0, $null_byte_position);
break;
}

if(false === $header) {
$this->last_error = 'Failed to read the object header';
return false;
}

$this->object_content_chunk = substr($content, strlen($header) + 1);

// Parse the header
$type_length = strpos($header, ' ');
$type = substr($header, 0, $type_length);
switch($type) {
case WP_Git_Pack_Processor::OBJECT_NAMES[WP_Git_Pack_Processor::OBJECT_TYPE_BLOB]:
$this->type = WP_Git_Pack_Processor::OBJECT_TYPE_BLOB;
break;
case WP_Git_Pack_Processor::OBJECT_NAMES[WP_Git_Pack_Processor::OBJECT_TYPE_TREE]:
$this->type = WP_Git_Pack_Processor::OBJECT_TYPE_TREE;
break;
case WP_Git_Pack_Processor::OBJECT_NAMES[WP_Git_Pack_Processor::OBJECT_TYPE_COMMIT]:
$this->type = WP_Git_Pack_Processor::OBJECT_TYPE_COMMIT;
break;
default:
$this->last_error = 'Invalid object type: ' . $type;
return false;
}
return true;
}

/**
 * Returns the type of the object loaded by the most recent read.
 *
 * @return mixed The value currently held in $this->type; null once
 *               reset() has cleared the object state.
 */
public function get_type() {
	$current_type = $this->type;
	return $current_type;
}

/**
 * Returns the length reported by the filesystem for the file that is
 * currently being streamed.
 *
 * NOTE(review): this is whatever get_streamed_file_length() reports for
 * the object file – presumably the on-disk (deflated) size rather than
 * the length stored in the git object header. Confirm against callers.
 *
 * @return mixed The value returned by the filesystem layer.
 */
public function get_length() {
	$streamed_length = $this->fs->get_streamed_file_length();
	return $streamed_length;
}

/**
 * Prepares streaming access to the object identified by $this->oid.
 *
 * Creates a fresh zlib inflate context and asks the filesystem to open
 * a file stream for the object's path. On any failure the inflate
 * context is dropped and $this->last_error is populated, so callers
 * never see a `false` result without an accompanying error message.
 *
 * @return bool True when both the inflate context and the file stream
 *              are ready, false otherwise.
 */
private function open_object_stream() {
	$this->content_inflate_handle = inflate_init(ZLIB_ENCODING_DEFLATE);
	if(!$this->content_inflate_handle) {
		$this->content_inflate_handle = null;
		$this->last_error = 'Failed to initialize inflate handle';
		return false;
	}
	if(!$this->fs->open_file_stream($this->get_object_path($this->oid))) {
		// Don't keep an inflate context around for a stream that never opened.
		$this->content_inflate_handle = null;
		// Prefer the filesystem's own message; fall back to a generic one
		// when the filesystem did not report anything.
		$fs_error = $this->fs->get_error_message();
		$this->last_error = $fs_error ? $fs_error : 'Failed to open the object file stream';
		return false;
	}
	return true;
}

/**
 * Advances the object stream by one chunk.
 *
 * Pulls the next chunk from the filesystem stream, inflates it, and
 * stores the inflated bytes so that get_object_content_chunk() can
 * return them.
 *
 * @return bool True when a new inflated chunk is available. False when
 *              the filesystem has no further chunk (last_error is then
 *              set to the filesystem's error message) or when inflating
 *              fails (the stream is closed in that case).
 */
public function next_object_chunk() {
	$advanced = $this->fs->next_file_chunk();
	if(false === $advanced) {
		$this->last_error = $this->fs->get_error_message();
		return false;
	}

	// Remember that the stream has moved past its initial position.
	$this->called_next_object_chunk = true;

	$deflated_bytes = $this->fs->get_file_chunk();
	$inflated_bytes = inflate_add($this->content_inflate_handle, $deflated_bytes);
	if(false !== $inflated_bytes) {
		$this->object_content_chunk = $inflated_bytes;
		return true;
	}

	$this->last_error = 'Failed to inflate chunk';
	$this->close_object_stream();
	return false;
}

/**
 * Returns the most recently stored chunk of inflated object content.
 *
 * @return mixed The value currently held in $this->object_content_chunk;
 *               null once reset() has cleared the object state.
 */
public function get_object_content_chunk() {
	$current_chunk = $this->object_content_chunk;
	return $current_chunk;
}

/**
 * Closes the filesystem stream and discards the inflate context.
 *
 * @return bool Always true.
 */
private function close_object_stream() {
	$filesystem = $this->fs;
	$filesystem->close_file_stream();
	$this->content_inflate_handle = null;
	return true;
}

/**
 * Returns the current object parsed as a commit.
 *
 * The parsed result is memoized in $this->parsed_commit; the object
 * body is only read and parsed when no memoized value exists.
 *
 * @return mixed Whatever WP_Git_Pack_Processor::parse_commit_message()
 *               produces for the object's contents.
 */
public function get_parsed_commit() {
	if(null !== $this->parsed_commit) {
		return $this->parsed_commit;
	}
	$raw_commit = $this->read_entire_object_contents();
	$this->parsed_commit = WP_Git_Pack_Processor::parse_commit_message($raw_commit);
	return $this->parsed_commit;
}

/**
 * Returns the current object parsed as a tree.
 *
 * The parsed result is memoized in $this->parsed_tree; the object body
 * is only read and parsed when no memoized value exists.
 *
 * @return mixed Whatever WP_Git_Pack_Processor::parse_tree_bytes()
 *               produces for the object's contents.
 */
public function get_parsed_tree() {
	if(null !== $this->parsed_tree) {
		return $this->parsed_tree;
	}
	$raw_tree = $this->read_entire_object_contents();
	$this->parsed_tree = WP_Git_Pack_Processor::parse_tree_bytes($raw_tree);
	return $this->parsed_tree;
}

/**
 * Loads the whole body of the current object into memory and returns it.
 *
 * The result is kept in $this->buffered_object_content so that repeated
 * calls for the same object are served from memory. The buffer is
 * checked first: re-initializing the stream goes through read_object(),
 * which resets the buffer, so checking afterwards would make the cache
 * unreachable and force a full re-read on every call.
 *
 * @return string|false|null The buffered object contents, or false when
 *                           the stream had to be re-initialized and that
 *                           failed (see get_last_error()).
 */
public function read_entire_object_contents() {
	// Serve from the buffer when this object has already been fully read.
	if(null !== $this->buffered_object_content) {
		return $this->buffered_object_content;
	}
	// If we've advanced the stream, we can't reuse it to read the entire
	// object anymore. Let's re-initialize the stream.
	if($this->called_next_object_chunk) {
		if(false === $this->read_object($this->oid)) {
			// Keep the error reported by read_object() instead of
			// clobbering it by pulling chunks from a stream that
			// failed to open.
			return false;
		}
	}
	// Load the entire object into memory and keep the result
	// for later use. We'll likely need it again before we're
	// done with the current object.
	$this->buffered_object_content = $this->object_content_chunk;
	while($this->next_object_chunk()) {
		$this->buffered_object_content .= $this->get_object_content_chunk();
	}
	return $this->buffered_object_content;
}

/**
 * Tells whether an object file exists for the given object id.
 *
 * @param mixed $oid Object id; passed to get_object_path() to derive
 *                   the file path that is checked.
 * @return mixed The result of the filesystem's is_file() check.
 */
public function oid_exists($oid) {
	$object_path = $this->get_object_path($oid);
	return $this->fs->is_file($object_path);
}
Expand All @@ -68,7 +179,10 @@ public function read_by_path($path, $root_tree_oid=null) {
if(false === $this->read_object($head_oid)) {
return false;
}
$root_tree_oid = $this->get_commit_tree_oid();
$root_tree_oid = $this->get_parsed_commit()['tree'] ?? null;
}
if($root_tree_oid === null) {
return false;
}
if(false === $this->read_object($root_tree_oid)) {
return false;
Expand All @@ -93,41 +207,90 @@ public function read_by_path($path, $root_tree_oid=null) {
return true;
}

public function get_descendants($tree_oid) {
if(false === $this->read_object($tree_oid)) {
return [];
/**
 * Returns the most recently recorded error.
 *
 * @return mixed The value currently held in $this->last_error; null
 *               once reset() has cleared the object state.
 */
public function get_last_error() {
	$error = $this->last_error;
	return $error;
}

public function iterate_objects_added_in($new_tree_oid, $old_tree_oid=null) {
if($new_tree_oid === $old_tree_oid) {
return false;
}
foreach ($this->parsed_tree as $object) {
if ($object['mode'] === WP_Git_Pack_Processor::FILE_MODE_DIRECTORY) {
yield from $this->get_descendants($object['sha1']);
} else {
yield $object;
}

// Resolve the actual tree oid if $new_tree_oid is a commit
if(false === $this->read_object($new_tree_oid)) {
$this->last_error = 'Failed to read object: ' . $new_tree_oid;
return false;
}
if($this->get_type() === WP_Git_Pack_Processor::OBJECT_TYPE_COMMIT) {
// yield the commit object itself
$parsed_commit = $this->get_parsed_commit();
yield $parsed_commit['tree'];
$new_tree_oid = $parsed_commit['tree'];
}
}

public function get_type() {
return $this->type;
}
// Resolve the actual tree oid if $old_tree_oid is a commit
if($old_tree_oid) {
if(false === $this->read_object($old_tree_oid)) {
$this->last_error = 'Failed to read object: ' . $old_tree_oid;
return false;
}
if($this->get_type() === WP_Git_Pack_Processor::OBJECT_TYPE_COMMIT) {
$old_tree_oid = $this->get_parsed_commit()['tree'];
}
}

public function get_length() {
return $this->length;
}
$stack = [[$new_tree_oid, $old_tree_oid]];

while(!empty($stack)) {
list($current_new_oid, $current_old_oid) = array_pop($stack);

if(false === $this->read_object($current_new_oid)) {
$this->last_error = 'Failed to read object: ' . $current_new_oid;
return false;
}
$new_tree = $this->get_parsed_tree();

$old_tree = [];
if($current_old_oid) {
if(false === $this->read_object($current_old_oid)) {
$this->last_error = 'Failed to read object: ' . $current_old_oid;
return false;
}
$old_tree = $this->get_parsed_tree();
}

public function get_contents() {
return $this->contents;
}
foreach($new_tree as $name => $object) {
// Object is new
if(!isset($old_tree[$name])) {
if(false === $this->read_object($object['sha1'])) {
$this->last_error = 'Failed to read object: ' . $object['sha1'];
return false;
}
yield $object['sha1'];
if($object['mode'] === WP_Git_Pack_Processor::FILE_MODE_DIRECTORY) {
$stack[] = [$object['sha1'], null];
}
continue;
}

public function get_parsed_commit() {
return $this->parsed_commit;
}
// Object is unchanged
if($object['sha1'] === $old_tree[$name]['sha1']) {
continue;
}

public function get_commit_tree_oid() {
return $this->parsed_commit['tree'];
}
if(false === $this->read_object($object['sha1'])) {
$this->last_error = 'Failed to read object: ' . $object['sha1'];
return false;
}

yield $object['sha1'];

public function get_parsed_tree() {
return $this->parsed_tree;
if($object['mode'] === WP_Git_Pack_Processor::FILE_MODE_DIRECTORY) {
// Object is a changed directory - add to stack for recursive processing
$stack[] = [$object['sha1'], $old_tree[$name]['sha1']];
}
}
}
}

public function set_ref_head($ref, $oid) {
Expand Down Expand Up @@ -205,6 +368,7 @@ public function commit($changeset, $commit_meta=[]) {

// Process blob updates
foreach ($updates as $path => $content) {
$path = '/' . ltrim($path, '/');
$blob_oid = $this->add_object(WP_Git_Pack_Processor::OBJECT_TYPE_BLOB, $content);
$this->mark_tree_path_changed($changed_trees, dirname($path));
$changed_trees[dirname($path)]['entries'][basename($path)] = [
Expand All @@ -216,6 +380,7 @@ public function commit($changeset, $commit_meta=[]) {

// Process deletes
foreach ($deletes as $path) {
$path = '/' . ltrim($path, '/');
if (!$this->read_by_path(dirname($path))) {
_doing_it_wrong(__METHOD__, 'File not found in HEAD: ' . $path, '1.0.0');
return false;
Expand All @@ -226,6 +391,8 @@ public function commit($changeset, $commit_meta=[]) {

// Process tree moves
foreach ($move_trees as $old_path => $new_path) {
$old_path = '/' . ltrim($old_path, '/');
$new_path = '/' . ltrim($new_path, '/');
if (!$this->read_by_path($old_path)) {
_doing_it_wrong(__METHOD__, 'Path not found in HEAD: ' . $old_path, '1.0.0');
return false;
Expand All @@ -244,7 +411,7 @@ public function commit($changeset, $commit_meta=[]) {
// Process trees bottom-up recursively
$root_tree_oid = $this->commit_tree('/', $changed_trees);

// Create commit object
// Create a new commit object
$commit_message = [];
$commit_message[] = "tree " . $root_tree_oid;
if($this->get_ref_head('HEAD')) {
Expand All @@ -262,9 +429,22 @@ public function commit($changeset, $commit_meta=[]) {
$this->last_error = 'Failed to set HEAD';
return false;
}
$this->reset();
return $commit_oid;
}

/**
 * Clears all per-object state.
 *
 * Closes the current object stream first, then forgets the object's
 * identity, its parsed forms, any buffered or chunked content, the
 * stream-advanced flag and the last recorded error.
 */
private function reset() {
	$this->close_object_stream();

	// Identity of the current object.
	$this->oid  = null;
	$this->type = null;

	// Streaming and buffering state.
	$this->object_content_chunk     = null;
	$this->buffered_object_content  = null;
	$this->called_next_object_chunk = false;

	// Memoized parse results and error state.
	$this->parsed_commit = null;
	$this->parsed_tree   = null;
	$this->last_error    = null;
}

private function mark_tree_path_changed(&$changed_trees, $path) {
while ($path !== '/') {
if (!isset($changed_trees[$path])) {
Expand Down Expand Up @@ -312,4 +492,5 @@ private function commit_tree($path, $changed_trees) {
);
}


}
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,7 @@ static public function parse_commit_message($commit_message) {
$parsed = [];
foreach($lines as $k => $line) {
if(!trim($line)) {
$parsed['message'] = array_slice($lines, $k + 1);
$parsed['message'] = implode("\n", array_slice($lines, $k + 1));
break;
}
$type_len = strpos($line, ' ');
Expand Down

0 comments on commit 7e833b6

Please sign in to comment.