mosaicml · knighton · Feb 14, 2024 · Jan 29, 2024 · Jan 31, 2024 · Jan 31, 2024
diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
@@ -1,9 +1,6 @@
 # Copyright 2022-2024 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
-# Copyright 2023 MosaicML Streaming authors
-# SPDX-License-Identifier: Apache-2.0
-
 """Streaming benchmarking."""
 
 from benchmarks import compression as compression

diff --git a/benchmarks/backends/datagen.py b/benchmarks/backends/datagen.py
@@ -1,9 +1,6 @@
 # Copyright 2022-2024 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
-# Copyright 2023 MosaicML Streaming authors
-# SPDX-License-Identifier: Apache-2.0
-
 """Generate a synthetic dataset."""
 
 from typing import Dict, List, Tuple, TypeVar

diff --git a/benchmarks/backends/plot.py b/benchmarks/backends/plot.py
@@ -1,9 +1,6 @@
 # Copyright 2022-2024 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
-# Copyright 2023 MosaicML Streaming authors
-# SPDX-License-Identifier: Apache-2.0
-
 """Plot dataset iteration time."""
 
 import json

diff --git a/benchmarks/backends/read.py b/benchmarks/backends/read.py
@@ -1,9 +1,6 @@
 # Copyright 2022-2024 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
-# Copyright 2023 MosaicML Streaming authors
-# SPDX-License-Identifier: Apache-2.0
-
 """Benchmark dataset iteration time."""
 
 import json

diff --git a/benchmarks/backends/write.py b/benchmarks/backends/write.py
@@ -1,9 +1,6 @@
 # Copyright 2022-2024 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
-# Copyright 2023 MosaicML Streaming authors
-# SPDX-License-Identifier: Apache-2.0
-
 """Generate a synthetic dataset and serialize it using each Streaming format/backend."""
 
 import os

diff --git a/examples/__init__.py b/examples/__init__.py
@@ -1,9 +1,6 @@
 # Copyright 2022-2024 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
-# Copyright 2023 MosaicML Streaming authors
-# SPDX-License-Identifier: Apache-2.0
-
 """Example streaming datasets."""
 
 from examples import multimodal as multimodal

diff --git a/examples/multimodal/__init__.py b/examples/multimodal/__init__.py
@@ -1,9 +1,6 @@
 # Copyright 2022-2024 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
-# Copyright 2023 MosaicML Streaming authors
-# SPDX-License-Identifier: Apache-2.0
-
 """Example multimodal streaming datasets."""
 
 from examples.multimodal import laion400m as laion400m

diff --git a/examples/multimodal/laion400m/__init__.py b/examples/multimodal/laion400m/__init__.py
@@ -1,7 +1,4 @@
 # Copyright 2022-2024 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
-# Copyright 2023 MosaicML Streaming authors
-# SPDX-License-Identifier: Apache-2.0
-
 """LAION-400M streaming dataset example."""
diff --git a/examples/multimodal/webvid/__init__.py b/examples/multimodal/webvid/__init__.py
@@ -1,7 +1,4 @@
 # Copyright 2022-2024 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
-# Copyright 2023 MosaicML Streaming authors
-# SPDX-License-Identifier: Apache-2.0
-
 """WebVid streaming dataset example."""
diff --git a/examples/multimodal/webvid/read.py b/examples/multimodal/webvid/read.py
@@ -90,7 +90,7 @@ class StreamingOutsideDTWebVid(StreamingDataset):
     """Streaming WebVid dataset.
 
     Videos are stored "outside" the shards, as a file per video. The extra download happens in
-    _download_thread ("DT"), when the download thread prefetches the sample.
+    _fetch_thread (the download thread, hence "DT"), when that thread prefetches the sample.
 
     Args:
         extra_local (str, optional): Base destination of extra local sample downloads.
@@ -133,7 +133,7 @@ def get_item(self, idx: int) -> Any:
 
         return obj
 
-    def _download_thread(self, it: _Iterator) -> None:
+    def _fetch_thread(self, it: _Iterator) -> None:
         """Download the relevant shards in the background while we are being iterated.
 
         This thread is started at the beginning of each epoch, and exits either when out of samples
@@ -154,26 +154,26 @@ def _download_thread(self, it: _Iterator) -> None:
                 break
 
             # If we're out of samples this epoch, exit this thread because we are done downloading.
-            if it.prepare_index == it.total:
+            if it.fetch_index == it.total:
                 break
 
             # If we are requested to only pre-download so many samples, if we have as many or more
             # downloaded already, we wait and check again later.
             if self.predownload is not None:
-                samples_ahead = it.prepare_index - it.yield_index
+                samples_ahead = it.fetch_index - it.yield_index
                 if self.predownload <= samples_ahead:
                     sleep(TICK)
                     continue
 
             # If we hit -1, we skip.
-            sample_id = it.sample_ids[it.prepare_index]
+            sample_id = it.sample_ids[it.fetch_index]
             if sample_id == -1:
-                it.prepare_index += 1
+                it.fetch_index += 1
                 continue
 
             # Download and decompress the shard for this sample, if not already done.
             shard_id, _ = self.spanner[sample_id]
-            self.prepare_shard(shard_id, False)
+            self.fetch_shard(shard_id, False)
 
             # Predownload the sample's extra data.
             obj = super().get_item(sample_id)
@@ -185,7 +185,7 @@ def _download_thread(self, it: _Iterator) -> None:
                     download_file(remote, local, self.download_timeout)
 
             # Step forward one sample.
-            it.prepare_index += 1
+            it.fetch_index += 1
 
         # Note that we exited.
         it.on_exit()
diff --git a/examples/text/__init__.py b/examples/text/__init__.py
@@ -1,9 +1,6 @@
 # Copyright 2022-2024 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
-# Copyright 2023 MosaicML Streaming authors
-# SPDX-License-Identifier: Apache-2.0
-
 """Example text streaming datasets."""
 
 from examples.text import c4 as c4

diff --git a/examples/vision/__init__.py b/examples/vision/__init__.py
@@ -1,9 +1,6 @@
 # Copyright 2022-2024 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
-# Copyright 2023 MosaicML Streaming authors
-# SPDX-License-Identifier: Apache-2.0
-
 """Example computer vision streaming datasets."""
 
 from examples.vision import ade20k as ade20k

diff --git a/examples/vision/ade20k/__init__.py b/examples/vision/ade20k/__init__.py
@@ -1,7 +1,4 @@
 # Copyright 2022-2024 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
-# Copyright 2023 MosaicML Streaming authors
-# SPDX-License-Identifier: Apache-2.0
-
 """ADE20K streaming dataset example."""
diff --git a/examples/vision/cifar10/__init__.py b/examples/vision/cifar10/__init__.py
@@ -1,7 +1,4 @@
 # Copyright 2022-2024 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
-# Copyright 2023 MosaicML Streaming authors
-# SPDX-License-Identifier: Apache-2.0
-
 """CIFAR10 streaming dataset example."""
diff --git a/examples/vision/cifar10/read.py b/examples/vision/cifar10/read.py
@@ -9,7 +9,7 @@
 
 from typing import Any, Dict
 
-from streaming.vision import StreamingVisionDataset
+from streaming.modality.vision import StreamingVisionDataset
 
 __all__ = ['StreamingCIFAR10']
 

diff --git a/examples/vision/cifar10/write.py b/examples/vision/cifar10/write.py
@@ -7,8 +7,8 @@
 
 from torchvision.datasets import CIFAR10
 
+from streaming.modality.vision import convert_image_class_dataset
 from streaming.util import get_list_arg
-from streaming.vision import convert_image_class_dataset
 
 
 def parse_args() -> Namespace:

diff --git a/examples/vision/cifar10/write_fake.py b/examples/vision/cifar10/write_fake.py
@@ -7,7 +7,7 @@
 import numpy as np
 from PIL import Image
 
-from streaming.vision import convert_image_class_dataset
+from streaming.modality.vision import convert_image_class_dataset
 
 
 def parse_args() -> Namespace:

diff --git a/examples/vision/coco/__init__.py b/examples/vision/coco/__init__.py
@@ -1,7 +1,4 @@
 # Copyright 2022-2024 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
-# Copyright 2023 MosaicML Streaming authors
-# SPDX-License-Identifier: Apache-2.0
-
 """COCO streaming dataset example."""
diff --git a/examples/vision/imagenet/__init__.py b/examples/vision/imagenet/__init__.py
@@ -1,7 +1,4 @@
 # Copyright 2022-2024 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
-# Copyright 2023 MosaicML Streaming authors
-# SPDX-License-Identifier: Apache-2.0
-
 """ImageNet streaming dataset example."""
diff --git a/examples/vision/imagenet/read.py b/examples/vision/imagenet/read.py
@@ -9,7 +9,7 @@
 
 from typing import Any, Dict
 
-from streaming.vision import StreamingVisionDataset
+from streaming.modality.vision import StreamingVisionDataset
 
 __all__ = ['StreamingImageNet']
 

diff --git a/notebooks/spark_dataframe_to_MDS.ipynb b/notebooks/spark_dataframe_to_MDS.ipynb
@@ -136,13 +136,10 @@
         },
         {
             "cell_type": "code",
-<<<<<<< HEAD:examples/spark_dataframe_to_MDS.ipynb
             "execution_count": null,
-=======
             "source": [
                 "from streaming.converters import dataframeToMDS"
             ],
->>>>>>> 7f5d160 (Move examples out, merge base/ upward (#494)):notebooks/spark_dataframe_to_MDS.ipynb
             "metadata": {
                 "id": "uzYHe6yYRzyV"
             },

diff --git a/scripts/long_lines.py b/scripts/long_lines.py
@@ -1,9 +1,6 @@
 # Copyright 2022-2024 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
-# Copyright 2023 MosaicML Streaming authors
-# SPDX-License-Identifier: Apache-2.0
-
 """Note long lines."""
 
 import os
@@ -13,6 +10,8 @@
 from re import Pattern
 from typing import IO, Iterator, Optional
 
+import numpy as np
+
 
 def parse_args() -> Namespace:
     """Parse command-line arguments.
@@ -59,6 +58,12 @@ def parse_args() -> Namespace:
         default='light',
         help='Whether to output in color. Supported options: none, light.',
     )
+    args.add_argument(
+        '--fancy',
+        type=int,
+        default=0,
+        help='Whether to do fancy output, which is harder to parse programmatically.',
+    )
     return args.parse_args()
 
 
@@ -166,25 +171,43 @@ def main(args: Namespace) -> int:
         txt = ', '.join(sorted(non_text_behaviors))
         raise ValueError(f'Unknown non-text behavior (must be one of: {txt}): {args.non_text}.')
 
-    count = 0
+    pairs = []
     for path in sorted(each_path(args.root, include, exclude)):
         if not (file := open_text(path, args.non_text)):
             continue
 
         lines = map(drop_newline, file)
         for line_no, line in enumerate(lines):
             if args.max_len < len(line):
-                good_line = line[:args.max_len]
+                fg_len = len(f'{path}:{line_no}:')
+                good_line = line[fg_len:args.max_len]
                 bad_line = line[args.max_len:]
+
                 if args.color == 'light':
-                    path = f'\033[0;97m{path}\033[0;0m'
-                    line_no = f'\033[0;92m{line_no}\033[0;0m'
-                    good_line = f'\033[0;94m{good_line}\033[0;0m'
-                    bad_line = f'\033[0;91m{bad_line}\033[0;0m'
-                print(f'{path}:{line_no}:{good_line}{bad_line}')
-                count += 1
-
-    return 1 if count else 0
+                    color_path = f'\033[0;97m{path}\033[0;0m'
+                    line_no = f'\033[1;92m{line_no}\033[0;0m'
+                    good_line = f'\033[1;34m{good_line}\033[0;0m'
+                    bad_line = f'\033[1;91m{bad_line}\033[0;0m'
+                else:
+                    color_path = str(path)
+
+                out_line = f'{color_path}:{line_no}:{good_line}{bad_line}\n'
+                pair = len(line), out_line
+                pairs.append(pair)
+
+    # Use a default so that finding no long lines does not raise on empty input.
+    max_vis_len = max((vis_len for vis_len, _ in pairs), default=0)
+    vocab = 0x2571, 0x2572
+    for vis_len, out_line in pairs:
+        if args.fancy:
+            count = max_vis_len - vis_len + 1
+            ords = np.random.choice(vocab, count)
+            pad = ''.join(map(chr, ords))
+            print(f'{out_line[:-1]}{chr(0x2523)}{pad}')
+        else:
+            print(out_line[:-1])
+
+    return 1 if pairs else 0
 
 
 if __name__ == '__main__':

diff --git a/simulation/core/sim_dataset.py b/simulation/core/sim_dataset.py
@@ -269,7 +269,7 @@ def __init__(self,
         local_foldernames = []
         for stream_id, stream in enumerate(self.streams):
             logger.info(f' Processing index file for stream {stream_id + 1}')
-            stream_shards = stream.get_shards(self.world, self.allow_unsafe_types)
+            stream_shards = stream.load_index()
             num_stream_samples = sum(map(len, stream_shards))
             index_filename = os.path.join(stream.local, stream.split or '', get_index_basename())
             index_filenames.append(index_filename)
@@ -290,7 +290,7 @@ def __init__(self,
         # Check that cache limit is possible.
         if cache_limit:
             self.cache_limit = normalize_bytes(cache_limit)
-            min_cache_usage = sum((stream.get_index_size() for stream in streams))
+            min_cache_usage = sum((stream.got_index_size for stream in streams))
             if self.cache_limit <= min_cache_usage:
                 raise ValueError(f'Minimum cache usage ({min_cache_usage} bytes) is larger than ' +
                                  f'the cache limit ({self.cache_limit} bytes). Please raise ' +

diff --git a/streaming/base/converters/README.md b/streaming/base/converters/README.md
diff --git a/streaming/base/format/base/__init__.py b/streaming/base/format/base/__init__.py
diff --git a/streaming/base/shared/__init__.py b/streaming/base/shared/__init__.py