From c26645e5779df420a0f813b809214a09d551f48a Mon Sep 17 00:00:00 2001 From: Harendra Kumar Date: Sun, 19 Jan 2025 09:35:13 +0530 Subject: [PATCH] Allow roots in fromChunk, fix redundant seps in append --- .../Internal/FileSystem/Path/Common.hs | 250 ++++++++++++++---- .../Streamly/Internal/FileSystem/PosixPath.hs | 54 ++-- 2 files changed, 227 insertions(+), 77 deletions(-) diff --git a/core/src/Streamly/Internal/FileSystem/Path/Common.hs b/core/src/Streamly/Internal/FileSystem/Path/Common.hs index adde3cffdf..11be7bc203 100644 --- a/core/src/Streamly/Internal/FileSystem/Path/Common.hs +++ b/core/src/Streamly/Internal/FileSystem/Path/Common.hs @@ -11,8 +11,10 @@ module Streamly.Internal.FileSystem.Path.Common OS (..) -- * Construction - , isValid + , isValidPath + , isValidPath' , validatePath + , validatePath' , validateFile , fromChunk , unsafeFromChunk @@ -42,6 +44,7 @@ module Streamly.Internal.FileSystem.Path.Common -- operation. search path is separated by : and : is allowed in paths on -- posix. Shell would escape it which needs to be handled. , append + , append' , unsafeAppend , unsafeJoinPaths @@ -417,6 +420,43 @@ isAbsolute Windows arr = -- * @\\\\.\\@ DOS local device namespace -- * @\\\\??\\@ DOS global namespace -- +-- >>> fPosix = Common.isRooted Common.Posix . packPosix +-- >>> fWin = Common.isRooted Common.Windows . packPosix +-- +-- >>> fPosix "/" +-- True +-- >>> fPosix "/x" +-- True +-- >>> fPosix "." +-- True +-- >>> fPosix "./x" +-- True +-- >>> fPosix ".." +-- False +-- >>> fPosix "../x" +-- False +-- +-- >>> fWin "/" +-- True +-- >>> fWin "/x" +-- True +-- >>> fWin "." +-- True +-- >>> fWin "./x" +-- True +-- >>> fWin ".." +-- False +-- >>> fWin "../x" +-- False +-- >>> fWin "c:" +-- True +-- >>> fWin "c:x" +-- True +-- >>> fWin "c:/" +-- True +-- >>> fWin "//x/y" +-- True +-- isRooted :: (Unbox a, Integral a) => OS -> Array a -> Bool isRooted Posix a = hasLeadingSeparator Posix a @@ -603,8 +643,10 @@ unsafeSplitUNC arr = -- otherwise root is returned as empty. If the path is rooted then the non-root -- part is guaranteed to not start with a separator. -- --- >>> toList (a,b) = (unpackPosix a, unpackPosix b) --- >>> splitPosix = toList . Common.splitRoot Common.Posix . packPosix +-- >>> toListPosix (a,b) = (unpackPosix a, unpackPosix b) +-- >>> splitPosix = toListPosix . Common.splitRoot Common.Posix . packPosix +-- >>> toListWin (a,b) = (unpackWindows a, unpackWindows b) +-- >>> splitWin = toListWin . Common.splitRoot Common.Windows . packWindows -- -- >>> splitPosix "/" -- ("/","") @@ -612,6 +654,9 @@ unsafeSplitUNC arr = -- >>> splitPosix "." -- (".","") -- +-- >>> splitPosix "./" +-- ("./","") +-- -- >>> splitPosix "/home" -- ("/","home") -- @@ -624,11 +669,28 @@ unsafeSplitUNC arr = -- >>> splitPosix "home" -- ("","home") -- +-- >>> splitWin "c:" +-- ("c:","") +-- +-- >>> splitWin "c:/" +-- ("c:/","") +-- +-- >>> splitWin "//" +-- ("//","") +-- +-- >>> splitWin "//x/y" +-- ("//x/","y") +-- +-- {-# INLINE splitRoot #-} splitRoot :: (Unbox a, Integral a) => OS -> Array a -> (Array a, Array a) -- NOTE: validatePath depends on splitRoot splitting the path without removing -- any redundant chars etc. It should just split and do nothing else. -- XXX We can put an assert here "arrLen == rootLen + stemLen". +-- XXX assert (isValidPath path == isValidPath root) +-- +-- NOTE: we cannot drop the trailing "/" on the root even if we want to - +-- because "c:/" will become "c:" and the two are not equivalent. splitRoot Posix arr | isRooted Posix arr = unsafeSplitTopLevel Posix arr @@ -1367,8 +1429,9 @@ isInvalidPathComponent = fmap (fmap charToWord) , "LPT1","LPT2","LPT3","LPT4","LPT5","LPT6","LPT7","LPT8","LPT9" ] -validatePath :: (MonadThrow m, Integral a, Unbox a) => OS -> Array a -> m () -validatePath Posix path = +validatePathWith :: (MonadThrow m, Integral a, Unbox a) => + Bool -> OS -> Array a -> m () +validatePathWith _ Posix path = let pathLen = Array.length path validLen = countLeadingValid Posix path in if pathLen == 0 @@ -1377,10 +1440,9 @@ validatePath Posix path = then throwM $ InvalidPath $ "Null char found after " ++ show validLen ++ " characters." else pure () -validatePath Windows path +validatePathWith allowRoot Windows path | Array.null path = throwM $ InvalidPath "Empty path" | otherwise = do - -- XXX give position of the first invalid char. if hasDrive path && postDriveSep > 1 -- "C://" then throwM $ InvalidPath $ "More than one separators between drive root and the path" @@ -1389,18 +1451,15 @@ validatePath Windows path if postDriveSep > 1 -- "///x" then throwM $ InvalidPath $ "Path starts with more than two separators" - -- XXX covered by the previous check - -- else if Array.length path == postDriveSep + 2 - -- then throwM $ InvalidPath $ "Only separators in share root" else if invalidRootComponent -- "//prn/x" then throwM $ InvalidPath -- XXX print the invalid component name - $ "Special filename component in share root" + $ "Special filename component found in share root" else if rootEndSeps /= 1 -- "//share//x" then throwM $ InvalidPath $ "Share name is needed and exactly one separator is needed " - ++ "between share root and the path" - else if Array.null stem -- "//share/" + ++ "after the share root" + else if not allowRoot && Array.null stem -- "//share/" then throwM $ InvalidPath $ "the share root must be followed by a non-empty path" else pure () @@ -1467,16 +1526,29 @@ validatePath Windows path invalidComponent = List.any (`List.elem` isInvalidPathComponent) (components stem) +-- | A valid root, share root or a valid path. +validatePath :: (MonadThrow m, Integral a, Unbox a) => OS -> Array a -> m () +validatePath = validatePathWith True + +-- | Like validatePath but on Windows only full paths are allowed, path roots +-- only are not allowed. Thus "//x/" is not valid. +validatePath' :: (MonadThrow m, Integral a, Unbox a) => OS -> Array a -> m () +validatePath' = validatePathWith False + -- Note: We can use powershell for testing path validity. -- "//share/x" works in powershell. -- But mixed forward and backward slashes do not work, it is treated as a path -- relative to current drive e.g. "\\/share/x" is treated as "C:/share/x". +-- +-- XXX Note: Windows may have case sensitive behavior depending on the file +-- system being used. Does it impact any of the case insensitive validations +-- below? --- | Check if the filepath is valid i.e. does the operating system allow such a --- path in listing or creating files? +-- | Check if the filepath is valid i.e. does the operating system or the file +-- system allow such a path in listing or creating files? -- --- >>> isValidPosix = Common.isValid Common.Posix . packPosix --- >>> isValidWin = Common.isValid Common.Windows . packWindows +-- >>> isValidPosix = Common.isValidPath Common.Posix . packPosix +-- >>> isValidWin = Common.isValidPath Common.Windows . packWindows -- -- Posix and Windows: -- @@ -1514,7 +1586,7 @@ validatePath Windows path -- False -- >>> isValidWin "c:\\ pRn \\x" -- False --- >>> isValidWin "pRn.x.txt" -- is this allowed? +-- >>> isValidWin "pRn.x.txt" -- False -- -- Windows drive root validations: @@ -1547,7 +1619,7 @@ validatePath Windows path -- >>> isValidWin "\\\\x" -- False -- >>> isValidWin "\\\\x\\" --- False +-- True -- >>> isValidWin "\\\\x\\y" -- True -- >>> isValidWin "//x/y" @@ -1580,8 +1652,8 @@ validatePath Windows path -- -- Windows long UNC path validations: -- --- >>> isValidWin "\\\\?\\UnC\\x" --- False +-- >>> isValidWin "\\\\?\\UnC\\x" -- UnC treated as share name +-- True -- >>> isValidWin "\\\\?\\UNC\\x" -- True -- >>> isValidWin "\\\\?\\UNC\\c:\\x" @@ -1593,12 +1665,27 @@ validatePath Windows path -- True -- >>> isValidWin "\\\\??\\x" -- True -isValid :: (Integral a, Unbox a) => OS -> Array a -> Bool -isValid os path = +isValidPath :: (Integral a, Unbox a) => OS -> Array a -> Bool +isValidPath os path = case validatePath os path of Nothing -> False Just _ -> True +-- | +-- >>> isValidWin = Common.isValidPath' Common.Windows . packWindows +-- +-- The following roots allowed by isValidPath are not allowed: +-- +-- >>> isValidWin "\\\\x\\" +-- False +-- >>> isValidWin "\\\\?\\UNC\\x" +-- False +isValidPath' :: (Integral a, Unbox a) => OS -> Array a -> Bool +isValidPath' os path = + case validatePath' os path of + Nothing -> False + Just _ -> True + -- A chunk is essentially an untyped Array i.e. Array Word8. We can either use -- the term ByteArray for that or just Chunk. The latter is shorter and we have -- been using it consistently in streamly. We use "bytes" for a stream of @@ -1611,8 +1698,6 @@ isValid os path = unsafeFromChunk :: Array Word8 -> Array a unsafeFromChunk = Array.unsafeCast --- XXX Also check for invalid chars on windows. - -- | On Posix it may fail if the byte array contains null characters. On -- Windows the array passed must be a multiple of 2 bytes as the underlying -- representation uses 'Word16'. @@ -1644,11 +1729,14 @@ unsafeFromChars encode s = let n = runIdentity $ Stream.fold Fold.length s in Array.fromPureStreamN n (encode s) --- Note: We do not sanitize the path i.e. remove duplicate separators, . +-- | Note: We do not sanitize the path i.e. remove duplicate separators, . -- segments, trailing separator etc because that would require unnecessary --- checks and modifications to the path which may not be required, this is only --- needed for path equality and is done during the equality check. If --- normalization is desired users can do it explicitly. +-- checks and modifications to the path which may not be used ever for any +-- useful purpose, it is only needed for path equality and can be done during +-- the equality check. If normalization is desired users can do it explicitly. +-- +-- fromChars should accept both - just root or path. Otherwise we will need a +-- separate type for Root. -- -- XXX Writing a custom fold for parsing a Posix path may be better for -- efficient bulk parsing when needed. We need the same code to validate a @@ -1693,22 +1781,32 @@ mkQ f = -- Operations of Path ------------------------------------------------------------------------------ --- XXX This can be generalized to an Array intersperse operation - {-# INLINE doAppend #-} doAppend :: (Unbox a, Integral a) => OS -> Array a -> Array a -> Array a doAppend os a b = unsafePerformIO $ do let lenA = Array.length a lenB = Array.length b assertM(lenA /= 0 && lenB /= 0) - assertM(countTrailingBy (isSeparatorWord os) a == 0) + let lastA = Array.unsafeGetIndexRev 0 a + sepA = isSeparatorWord os lastA + sepB = isSeparatorWord os (Array.unsafeGetIndex 0 b) let len = lenA + 1 + lenB arr <- MutArray.emptyOf len arr1 <- MutArray.unsafeSplice arr (Array.unsafeThaw a) - -- XXX Do not add the separator if already present in the first or the - -- second path - arr2 <- MutArray.unsafeSnoc arr1 (charToWord (primarySeparator os)) - arr3 <- MutArray.unsafeSplice arr2 (Array.unsafeThaw b) + arr2 <- + if ( lenA /= 0 + && lenB /= 0 + && not sepA + && not sepB + && not (os == Windows && lastA == charToWord ':') + ) + then MutArray.unsafeSnoc arr1 (charToWord (primarySeparator os)) + else pure arr1 + let arrB = + if (sepA && sepB) + then snd $ Array.unsafeSplitAt 1 b + else b + arr3 <- MutArray.unsafeSplice arr2 (Array.unsafeThaw arrB) return (Array.unsafeFreeze arr3) {-# INLINE withAppendCheck #-} @@ -1721,36 +1819,84 @@ withAppendCheck os toStr arr f = -- | Does not check if any of the path is empty or if the second path is -- absolute. +-- +-- >>> appendPosix a b = unpackPosix $ Common.unsafeAppend Common.Posix (Common.toString Unicode.decodeUtf8) (packPosix a) (packPosix b) +-- >>> appendWin a b = unpackWindows $ Common.unsafeAppend Common.Windows (Common.toString Unicode.decodeUtf16le') (packWindows a) (packWindows b) +-- +-- >>> appendPosix "x" "y" +-- "x/y" +-- >>> appendPosix "x/" "y" +-- "x/y" +-- >>> appendPosix "x" "/y" +-- "x/y" +-- >>> appendPosix "x/" "/y" +-- "x/y" +-- {-# INLINE unsafeAppend #-} unsafeAppend :: (Unbox a, Integral a) => OS -> (Array a -> String) -> Array a -> Array a -> Array a unsafeAppend os toStr a b = assert (withAppendCheck os toStr b True) (doAppend os a b) --- XXX Note: an altrenative way of joining "c:" and "x" could be treat ":" as a --- separator and not add a "/". If someone wants to add a slash then they can --- append it to the root e.g. append "c:" "/". If we do this then we will not --- need a joinRoot. --- --- XXX Also, we cannot append "/" to "c:/" as it will make the path invalid. --- XXX On Windows a path starting with / is not absolute and can be appended to --- a path/drive ending with :. - -- | Note that append joins two paths using a separator between the paths. --- Using append to join a root with a path segment can change the meaning of --- the path on windows at least in one case e.g. "c:/x" is not the same as --- "c:x". For such cases we should use joinRoot. +-- +-- On Windows, joining a drive "c:" with "x" does not add a separator between +-- the two because "c:x" is different from "c:/x". Note "c:" and "/x" are both +-- rooted paths, therefore, append cannot be used to join them. You will need +-- to use dropRoot on the second path before joining them. Similarly for +-- joining "//x/" and "/y". +-- +-- >>> import Data.Either (Either, isLeft) +-- >>> import Control.Exception (SomeException, evaluate, try) -- -- >>> appendPosix a b = unpackPosix $ Common.append Common.Posix (Common.toString Unicode.decodeUtf8) (packPosix a) (packPosix b) +-- >>> appendWin a b = unpackWindows $ Common.append Common.Windows (Common.toString Unicode.decodeUtf16le') (packWindows a) (packWindows b) +-- >>> failPosix a b = (try (evaluate (appendPosix a b)) :: IO (Either SomeException String)) >>= return . isLeft +-- >>> failWin a b = (try (evaluate (appendWin a b)) :: IO (Either SomeException String)) >>= return . isLeft -- -- >>> appendPosix "x" "y" -- "x/y" +-- >>> appendPosix "x/" "y" +-- "x/y" +-- >>> failPosix "x" "/" +-- True -- +-- >>> appendWin "x" "y" +-- "x\\y" +-- >>> appendWin "x/" "y" +-- "x/y" +-- >>> appendWin "c:" "x" +-- "c:x" +-- >>> appendWin "c:/" "x" +-- "c:/x" +-- >>> appendWin "//x" "y" +-- "//x\\y" +-- >>> appendWin "//x/" "y" +-- "//x/y" +-- +-- >>> failWin "c:" "/" +-- True +-- >>> failWin "c:" "/x" +-- True +-- >>> failWin "c:/" "/x" +-- True +-- >>> failWin "//x/" "/y" +-- True {-# INLINE append #-} append :: (Unbox a, Integral a) => OS -> (Array a -> String) -> Array a -> Array a -> Array a append os toStr a b = - withAppendCheck os toStr b (doAppend os a b) + assert(countTrailingBy (isSeparatorWord os) a == 0) + (withAppendCheck os toStr b (doAppend os a b)) + +-- | A stricter version of append which requires the first path to be a +-- directory like path i.e. with a trailing separator. +-- +-- /Unimplemented/ +{-# INLINE append' #-} +append' :: -- (Unbox a, Integral a) => + OS -> (Array a -> String) -> Array a -> Array a -> Array a +append' = undefined -- XXX MonadIO? @@ -1830,9 +1976,9 @@ eqWindowsAbsRootStrict a b = (fmap toDefaultSeparator $ Array.read b) -- XXX Use options in the same eqPath routine instead of having different --- routines. On posix even macos can have case insensitive comparison. --- ALLOW_RELATIVE_PATH_EQUALITY, IGNORE_TRAILING_SEPARATOR, --- IGNORE_CASE. +-- routines. On posix even macOs can have case insensitive comparison. On +-- Windows also case sensitive behavior may depend on the file system being +-- used. ALLOW_RELATIVE_EQ, IGNORE_TRAILING_SEPARATOR, IGNORE_CASE. -- -- The following options can be added later: PROCESS_PARENT_REFS, -- DONT_IGNORE_REDUNDANT_SEPARATORS, DONT_IGNORE_DOT_COMPONENTS. diff --git a/core/src/Streamly/Internal/FileSystem/PosixPath.hs b/core/src/Streamly/Internal/FileSystem/PosixPath.hs index d1597b1e86..aa91e9c54d 100644 --- a/core/src/Streamly/Internal/FileSystem/PosixPath.hs +++ b/core/src/Streamly/Internal/FileSystem/PosixPath.hs @@ -43,7 +43,8 @@ module Streamly.Internal.FileSystem.OS_PATH -- * Construction , validatePath - , isValid + , validatePath' + , isValidPath , fromChunk , unsafeFromChunk , fromChars @@ -138,36 +139,31 @@ For APIs that have not been released yet. -- | A type representing file system paths on OS_NAME. -- --- A OS_PATH is validated before construction unless unsafe constructors are used --- to create it. Rules and invariants maintained by the safe construction --- methods are as follows: --- --- * Does not contain a null character. --- * Does not have a trailing separator except in the root path. --- * Does not have a trailing @.@ component. --- * Does not have consecutive separators except in UNC prefixes on Windows. --- * Does not contain @\/.\/@ path components except in a UNC prefix on --- Windows. +-- A OS_PATH is validated before construction unless unsafe constructors are +-- used to create it. For validations performed by the safe construction +-- methods see the 'fromChars' function. -- -- Note that in some cases the file system may perform unicode normalization on -- paths (e.g. Apple HFS), it may cause surprising results as the path used by -- the user may not have the same bytes as later returned by the file system. newtype OS_PATH = OS_PATH (Array WORD_TYPE) --- Show instance prints raw bytes without any decoding for rountdtripping. We --- can use a Lax Utf8 decoding and print it as a string for convenience? Should --- we print raw path as a string instead, may be useful for ascii chars but --- utf8 encoded chars may be unprintable. Better use toString if you want to --- pretty print the path. +-- Show instance is not provided because Show and Read should be inverses but +-- we cannot ensure that as the path encoding may depend on the OS or the +-- file system. We can print the byte values though but that won't be very +-- useful. If we do not care about Show and Read being striclty faithful +-- inverses we can use the default encoding/decoding to implement them. +-- Otherwise we can just use toString, fromString for Show and Read purposes. +-- {- instance Show OS_PATH where show (OS_PATH x) = show x -} --- XXX The Eq instance needs to make sure that the paths are equivalent. If we --- normalize the paths we can do a byte comparison. However, on windows paths --- are case insensitive but the case is preserved, therefore, we cannot --- normalize and need to do case insensitive comparison. +-- XXX The Eq instance may be provided but it will require some sensible +-- defaults for comparison. For example, should we use case sensitive or +-- insensitive comparison? It depends on the underlying file system. For now +-- now we have eqPath operations for equality comparison. instance IsPath OS_PATH OS_PATH where unsafeFromPath = id @@ -176,8 +172,8 @@ instance IsPath OS_PATH OS_PATH where -- XXX Use rewrite rules to eliminate intermediate conversions for better -- efficiency. If the argument path is already verfied for a property, we --- should not verify it again e.g. if we adapt (Loc path) as (Loc (Dir path)) --- then we should not verify it to be Loc again. +-- should not verify it again e.g. if we adapt (Rooted path) as (Rooted (Dir +-- path)) then we should not verify it to be Rooted again. -- XXX castPath? @@ -203,8 +199,17 @@ dropTrailingSeparators (OS_PATH arr) = validatePath :: MonadThrow m => OS_PATH -> m () validatePath (OS_PATH a) = Common.validatePath Common.OS_NAME a -isValid :: OS_PATH -> Bool -isValid (OS_PATH a) = Common.isValid Common.OS_NAME a +isValidPath :: OS_PATH -> Bool +isValidPath (OS_PATH a) = Common.isValidPath Common.OS_NAME a + +-- Note: CPP gets confused by the prime suffix, so we have to put the CPP +-- macros on the next line to get it to work. + +validatePath' :: MonadThrow m => + OS_PATH -> m () +validatePath' + (OS_PATH a) = Common.validatePath' + Common.OS_NAME a ------------------------------------------------------------------------------ -- Construction @@ -243,7 +248,6 @@ fromChunk arr = Common.fromChunk Common.OS_NAME arr >>= fromPath . OS_PATH -- * the stream contains invalid unicode characters #if defined(IS_WINDOWS) -- * the path starts with more than 2 separators --- * the share root must be followed by a non-empty path -- * the root drive or share name and the path is separated by more than one separators -- * the path contains special characters not allowed in paths -- * the path contains special file names not allowed in paths