From efc13cf5b24aafebd375b807b12d2ca5faddab8c Mon Sep 17 00:00:00 2001 From: Harendra Kumar Date: Mon, 30 Dec 2024 15:56:14 +0530 Subject: [PATCH] fixup: complete splitPath, add splitRoot --- .../Internal/FileSystem/Path/Common.hs | 382 +++++++++++++++--- 1 file changed, 330 insertions(+), 52 deletions(-) diff --git a/core/src/Streamly/Internal/FileSystem/Path/Common.hs b/core/src/Streamly/Internal/FileSystem/Path/Common.hs index d3186d36a3..5e097bcb2b 100644 --- a/core/src/Streamly/Internal/FileSystem/Path/Common.hs +++ b/core/src/Streamly/Internal/FileSystem/Path/Common.hs @@ -36,13 +36,22 @@ module Streamly.Internal.FileSystem.Path.Common , append , unsafeAppend + , splitRoot + -- , dropRoot + -- , joinRoot , splitPath , unsafeJoinPaths + -- , processParentRefs -- * Utilities , wordToChar , charToWord , unsafeIndexChar + + -- * Internal + , unsafeSplitTopLevel + , unsafeSplitDrive + , unsafeSplitUNC ) where @@ -255,12 +264,18 @@ isSeparatorWord os = isSeparator os . wordToChar -- Path normalization ------------------------------------------------------------------------------ -countTrailingBy :: Unbox a => (a -> Bool) -> Array a -> Int -countTrailingBy p arr = +countWhile :: (a -> Bool) -> Stream Identity a -> Int +countWhile p = runIdentity - $ Stream.fold Fold.length - $ Stream.takeWhile p - $ Array.readRev arr + . Stream.fold Fold.length + . Stream.takeWhile p + +{-# INLINE countLeadingBy #-} +countLeadingBy :: Unbox a => (a -> Bool) -> Array a -> Int +countLeadingBy p = countWhile p . Array.read + +countTrailingBy :: Unbox a => (a -> Bool) -> Array a -> Int +countTrailingBy p = countWhile p . Array.readRev -- | If the path is @//@ the result is @/@. If it is @a//@ then the result is -- @a@. @@ -304,60 +319,83 @@ unsafeHasDrive a -- | A path that starts with a alphabet followed by a colon e.g. @C:...@. hasDrive :: (Unbox a, Integral a) => Array a -> Bool -hasDrive a = Array.byteLength a >= 2 && unsafeHasDrive a +hasDrive a = Array.length a >= 2 && unsafeHasDrive a -- | A path that contains only an alphabet followed by a colon e.g. @C:@. isDrive :: (Unbox a, Integral a) => Array a -> Bool -isDrive a = Array.byteLength a == 2 && unsafeHasDrive a +isDrive a = Array.length a == 2 && unsafeHasDrive a ------------------------------------------------------------------------------ -- Relative or Absolute ------------------------------------------------------------------------------ -- | A path relative to cur dir it is either @.@ or starts with @./@. -isRelativeCurDir :: (Unbox a, Integral a) => Array a -> Bool -isRelativeCurDir a - -- Assuming the path is not empty. +isRelativeCurDir :: (Unbox a, Integral a) => OS -> Array a -> Bool +isRelativeCurDir os a + | len == 0 = False -- empty path should not occur | wordToChar (Array.getIndexUnsafe 0 a) /= '.' = False - | Array.byteLength a < 2 = True - | otherwise = isSeparator Windows (wordToChar (Array.getIndexUnsafe 1 a)) + | len < 2 = True + | otherwise = isSeparatorWord os (Array.getIndexUnsafe 1 a) + + where + + len = Array.length a + +-- | A path starting with a separator. +hasLeadingSeparator :: (Unbox a, Integral a) => OS -> Array a -> Bool +hasLeadingSeparator os a + | Array.length a == 0 = False -- empty path should not occur + | isSeparatorWord os (Array.getIndexUnsafe 0 a) = True + | otherwise = False + +-- | A non-UNC path starting with a separator. +isRelativeCurDriveRoot :: (Unbox a, Integral a) => Array a -> Bool +isRelativeCurDriveRoot a + | len == 0 = False -- empty path should not occur + | len == 1 && sep0 = True + | sep0 && c0 /= c1 = True + | otherwise = False + + where --- | The path starting with a separator. On Windows this is relative to current --- drive while on Posix this is absolute path as there is only one drive. -isRelativeCurDrive :: (Unbox a, Integral a) => OS -> Array a -> Bool -isRelativeCurDrive os a = - -- Assuming the path is not empty. - isSeparator os (wordToChar (Array.getIndexUnsafe 0 a)) + len = Array.length a + c0 = Array.getIndexUnsafe 0 a + c1 = Array.getIndexUnsafe 1 a + sep0 = isSeparatorWord Windows c0 -- | @C:@ or @C:a...@. isRelativeWithDrive :: (Unbox a, Integral a) => Array a -> Bool isRelativeWithDrive a = hasDrive a - && ( Array.byteLength a < 3 + && ( Array.length a < 3 || not (isSeparator Windows (unsafeIndexChar 2 a)) ) -- | @C:\...@. Note that "C:" or "C:a" is not absolute. isAbsoluteWithDrive :: (Unbox a, Integral a) => Array a -> Bool isAbsoluteWithDrive a = - Array.byteLength a >= 3 + Array.length a >= 3 && unsafeHasDrive a && isSeparator Windows (unsafeIndexChar 2 a) --- | @\\\\...@ +-- | @\\\\...@ or @//...@ isAbsoluteUNC :: (Unbox a, Integral a) => Array a -> Bool isAbsoluteUNC a - | Array.byteLength a < 2 = False - | unsafeIndexChar 0 a /= '\\' = False - | unsafeIndexChar 1 a /= '\\' = False - | otherwise = True + | Array.length a < 2 = False + | isSeparatorWord Windows c0 && c0 == c1 = True + | otherwise = False + + where + + c0 = Array.getIndexUnsafe 0 a + c1 = Array.getIndexUnsafe 1 a -- | Note that on Windows a path starting with a separator is relative to -- current drive while on Posix this is absolute path as there is only one -- drive. isAbsolute :: (Unbox a, Integral a) => OS -> Array a -> Bool isAbsolute Posix arr = - isRelativeCurDrive Posix arr + hasLeadingSeparator Posix arr isAbsolute Windows arr = isAbsoluteWithDrive arr || isAbsoluteUNC arr @@ -377,11 +415,27 @@ isAbsolute Windows arr = -- -- XXX For the untyped Path we can allow appending "./x" to other paths. We can -- leave this to the programmer. In typed paths we can allow "./x" in segments. --- --- XXX C:\\ is invalid, \\share\ is invalid? -- XXX Empty path can be taken to mean "." except in case of UNC paths - --- | Rooted paths on Posix and Windows, +-- +-- XXX "//share/x" works in powershell. But mixed forward and backward slashes +-- do not work, it is treated as a path relative to current drive e.g. +-- "\\/share/x" is treated as "C:/share/x". +-- +-- Invalid paths: +-- "C:\\\\" +-- "C:\\\\x" +-- "\\\\" +-- "\\\\share" +-- "\\\\share\\" +-- "\\\\share\\\\" +-- "\\\\share\\\\x" +-- "\\\\?\\c:" +-- "\\\\?\\c:\\\\\\" + +-- | Any path that starts with a separator, @./@ or a drive prefix is a rooted +-- path. +-- +-- Rooted paths on Posix and Windows, -- * @/...@ a path starting with a separator -- * @.@ current dir -- * @./...@ a location relative to current dir @@ -399,13 +453,12 @@ isAbsolute Windows arr = -- isLocation :: (Unbox a, Integral a) => OS -> Array a -> Bool isLocation Posix a = - isRelativeCurDrive Posix a - || isRelativeCurDir a + hasLeadingSeparator Posix a + || isRelativeCurDir Posix a isLocation Windows a = - isRelativeCurDrive Windows a - || isRelativeCurDir a + hasLeadingSeparator Windows a + || isRelativeCurDir Windows a || hasDrive a -- curdir-in-drive relative, drive absolute - || isAbsoluteUNC a isSegment :: (Unbox a, Integral a) => OS -> Array a -> Bool isSegment os = not . isLocation os @@ -453,8 +506,8 @@ maybeFile os arr = do {-# INLINE doAppend #-} doAppend :: (Unbox a, Integral a) => OS -> Array a -> Array a -> Array a doAppend os a b = unsafePerformIO $ do - let lenA = Array.byteLength a - lenB = Array.byteLength b + let lenA = Array.length a + lenB = Array.length b assertM(lenA /= 0 && lenB /= 0) assertM(countTrailingBy (isSeparatorWord os) a == 0) let len = lenA + 1 + lenB @@ -486,51 +539,276 @@ append :: (Unbox a, Integral a) => append os toStr a b = withAppendCheck os toStr b (doAppend os a b) +------------------------------------------------------------------------------ +-- Splitting +------------------------------------------------------------------------------ + +unsafeSplitPrefix :: (Unbox a, Integral a) => + OS -> Int -> Array a -> (Array a, Array a) +unsafeSplitPrefix os prefixLen arr = (drive, path) + + where + + len = Array.length arr + -- XXX Array.readFrom may be useful here + afterDrive = Array.getSliceUnsafe prefixLen (len - prefixLen) arr + n = countLeadingBy (isSeparatorWord os) afterDrive + cnt = prefixLen + n + drive = Array.getSliceUnsafe 0 cnt arr + path = Array.getSliceUnsafe cnt (len - cnt) arr + +-- XXX We can produce a normalized result for the drive during split. + +-- | Split a path prefixed with a separator into (drive, path) tuple. +-- +-- >>> toListPosix (a,b) = (unpackPosix a, unpackPosix b) +-- >>> splitPosix = toListPosix . Common.unsafeSplitTopLevel Common.Posix . packPosix +-- +-- >>> toListWin (a,b) = (unpackWindows a, unpackWindows b) +-- >>> splitWin = toListWin . Common.unsafeSplitTopLevel Common.Windows . packWindows +-- +-- >>> splitPosix "/" +-- ("/","") +-- +-- >>> splitPosix "//" +-- ("//","") +-- +-- >>> splitPosix "/home" +-- ("/","home") +-- +-- >>> splitPosix "/home/user" +-- ("/","home/user") +-- +-- >>> splitWin "\\" +-- ("\\","") +-- +-- >>> splitWin "\\home" +-- ("\\","home") +unsafeSplitTopLevel :: (Unbox a, Integral a) => + OS -> Array a -> (Array a, Array a) +-- Note on Windows we should be here only when the path starts with exactly one +-- separator, otherwise it would be UNC path. But on posix multiple separators +-- are valid. +unsafeSplitTopLevel os = unsafeSplitPrefix os 1 + +-- In some cases there is no valid drive component e.g. "\\a\\b", though if we +-- consider relative roots then we could use "\\" as the root in this case. In +-- other cases there is no valid path component e.g. "C:" or "\\share\\" though +-- the latter is not a valid path and in the former case we can use "." as the +-- path component. +-- +-- XXX Note, on windows C:\\\\x is an invalid path. + +-- | Split a path prefixed with drive into (drive, path) tuple. +-- +-- >>> toList (a,b) = (unpackPosix a, unpackPosix b) +-- >>> split = toList . Common.unsafeSplitDrive . packPosix +-- +-- >>> split "C:" +-- ("C:","") +-- +-- >>> split "C:a" +-- ("C:","a") +-- +-- >>> split "C:\\" +-- ("C:\\","") +-- +-- >>> split "C:\\\\" -- this is invalid path +-- ("C:\\\\","") +-- +-- >>> split "C:\\\\a" -- this is invalid path +-- ("C:\\\\","a") +-- +-- >>> split "C:\\/a/b" -- is this valid path? +-- ("C:\\/","a/b") +unsafeSplitDrive :: (Unbox a, Integral a) => Array a -> (Array a, Array a) +unsafeSplitDrive = unsafeSplitPrefix Windows 2 + +-- | Skip separators and then parse the next path segment. +-- Return (segment offset, segment length). +parseSegment :: (Unbox a, Integral a) => Array a -> Int -> Int -> (Int, Int) +parseSegment arr len sepOff = (segOff, segCnt) + + where + + arr1 = Array.getSliceUnsafe sepOff (len - sepOff) arr + sepCnt = countLeadingBy (isSeparatorWord Windows) arr1 + segOff = sepOff + sepCnt + + arr2 = Array.getSliceUnsafe segOff (len - segOff) arr + segCnt = countLeadingBy (not . isSeparatorWord Windows) arr2 + +-- XXX We can split a path as "root, . , rest" or "root, /, rest". +-- XXX We can remove the redundant path separator after the root. With that +-- joining root vs other paths will become similar. But there are some special +-- cases e.g. "C:a" does not have a separator, can we make this "C:.\\a"? In +-- case of "/home" we have "/" as root and we cannot add another separator +-- between this and the rest of the path. + +-- | Split a path prefixed with "\\" into (drive, path) tuple. +-- +-- >>> toList (a,b) = (unpackPosix a, unpackPosix b) +-- >>> split = toList . Common.unsafeSplitUNC . packPosix +-- +-- >> split "" +-- ("","") +-- +-- >>> split "\\\\" +-- ("\\\\","") +-- +-- >>> split "\\\\server" +-- ("\\\\server","") +-- +-- >>> split "\\\\server\\" +-- ("\\\\server\\","") +-- +-- >>> split "\\\\server\\home" +-- ("\\\\server\\","home") +-- +-- >>> split "\\\\?\\c:" +-- ("\\\\?\\c:","") +-- +-- >>> split "\\\\?\\c:/" +-- ("\\\\?\\c:/","") +-- +-- >>> split "\\\\?\\c:\\home" +-- ("\\\\?\\c:\\","home") +-- +-- >>> split "\\\\?\\UNC/" +-- ("\\\\?\\UNC/","") +-- +-- >>> split "\\\\?\\UNC\\server" +-- ("\\\\?\\UNC\\server","") +-- +-- >>> split "\\\\?\\UNC/server\\home" +-- ("\\\\?\\UNC/server\\","home") +-- +unsafeSplitUNC :: (Unbox a, Integral a) => Array a -> (Array a, Array a) +unsafeSplitUNC arr = + if cnt1 == 1 && unsafeIndexChar 2 arr == '?' + then do + if uncLen == 3 + && unsafeIndexChar uncOff arr == 'U' + && unsafeIndexChar (uncOff + 1) arr == 'N' + && unsafeIndexChar (uncOff + 2) arr == 'C' + then unsafeSplitPrefix Windows (serverOff + serverLen) arr + else unsafeSplitPrefix Windows sepOff1 arr + else unsafeSplitPrefix Windows sepOff arr + + where + + len = Array.length arr + arr1 = Array.getSliceUnsafe 2 (len - 2) arr + cnt1 = countLeadingBy (not . isSeparatorWord Windows) arr1 + sepOff = 2 + cnt1 + + -- XXX there should be only one separator in a valid path? + -- XXX it should either be UNC or two letter drive in a valid path + (uncOff, uncLen) = parseSegment arr len sepOff + sepOff1 = uncOff + uncLen + (serverOff, serverLen) = parseSegment arr len sepOff1 + +-- XXX should we make the root Maybe? Both components will have to be Maybe to +-- avoid an empty path. + +-- | If a path is rooted then separate the root and the remaining path +-- otherwise root is returned as empty. +-- +-- >>> toList (a,b) = (unpackPosix a, unpackPosix b) +-- >>> splitPosix = toList . Common.splitRoot Common.Posix . packPosix +-- +-- >>> splitPosix "/" +-- ("/","") +-- +-- >>> splitPosix "." +-- (".","") +-- +-- >>> splitPosix "/home" +-- ("/","home") +-- +-- >>> splitPosix "//" +-- ("//","") +-- +-- >>> splitPosix "./home" +-- ("./","home") +-- +-- >>> splitPosix "home" +-- ("","home") +-- +{-# INLINE splitRoot #-} +splitRoot :: (Unbox a, Integral a) => OS -> Array a -> (Array a, Array a) +splitRoot Posix arr + | isLocation Posix arr + = unsafeSplitTopLevel Posix arr + | otherwise = (Array.empty, arr) +splitRoot Windows arr + | isRelativeCurDriveRoot arr || isRelativeCurDir Windows arr + = unsafeSplitTopLevel Windows arr + | hasDrive arr = unsafeSplitDrive arr + | isAbsoluteUNC arr = unsafeSplitUNC arr + | otherwise = (Array.empty, arr) + -- | Split a path into components separated by the path separator. "." -- components in the path are ignored except in the leading position. Multiple -- consecutive separators are ignored. -- -- >>> :{ --- splitPath Common.Posix = Stream.toList . fmap unpackPosix . Common.splitPath Common.Posix . packPosix --- splitPath Common.Windows = Stream.toList . fmap unpackWindows . Common.splitPath Common.Windows . packWindows +-- splitPosix = Stream.toList . fmap unpackPosix . Common.splitPath Common.Posix . packPosix +-- splitWin = Stream.toList . fmap unpackWindows . Common.splitPath Common.Windows . packWindows -- :} -- --- >>> splitPath Common.Posix "." +-- >>> splitPosix "." -- ["."] -- --- >>> splitPath Common.Posix "././" --- ["."] +-- >>> splitPosix "././" +-- ["./"] -- --- >>> splitPath Common.Posix "./a/b/." --- [".","a","b"] +-- >>> splitPosix "./a/b/." +-- ["./","a","b"] -- --- >>> splitPath Common.Posix "home//user/./..////\\directory/." +-- >>> splitPosix "/" +-- ["/"] +-- +-- >>> splitPosix "/home" +-- ["/","home"] +-- +-- >>> splitWin "/home" +-- ["/","home"] +-- +-- >>> splitPosix "home//user/./..////\\directory/." -- ["home","user","..","\\directory"] -- --- >>> splitPath Common.Windows "home//user/./..////\\directory/." +-- >>> splitWin "home//user/./..////\\directory/." -- ["home","user","..","directory"] -- {-# INLINE splitPath #-} splitPath - :: forall a m. (Unbox a, Integral a, MonadIO m) + :: (Unbox a, Integral a, MonadIO m) => OS -> Array a -> Stream m (Array a) splitPath os arr = - Stream.indexEndBy_ (isSeparatorWord os) (Array.read arr) - & Stream.filter (not . shouldFilterOut) - & fmap (\(i, len) -> Array.getSliceUnsafe i len arr) + let stream = + Stream.indexEndBy_ (isSeparatorWord os) (Array.read rest) + & Stream.filter (not . shouldFilterOut) + & fmap (\(i, len) -> Array.getSliceUnsafe i len rest) + + in if Array.length root == 0 + then stream + else Stream.cons root stream where + (root, rest) = splitRoot os arr + shouldFilterOut (off, len) = len == 0 || - (len == 1 && unsafeIndexChar off arr == '.') + (len == 1 && unsafeIndexChar off rest == '.') -- | Join paths by path separator. Does not check if the paths being appended -- are rooted or path segments. Note that splitting and joining may not give -- exactly the original path but an equivalent, normalized path. {-# INLINE unsafeJoinPaths #-} unsafeJoinPaths - :: forall a m. (Unbox a, Integral a, MonadIO m) + :: (Unbox a, Integral a, MonadIO m) => OS -> Stream m (Array a) -> m (Array a) unsafeJoinPaths os = -- XXX This can be implemented more efficiently using an Array intersperse