From efc13cf5b24aafebd375b807b12d2ca5faddab8c Mon Sep 17 00:00:00 2001
From: Harendra Kumar <harendra@composewell.com>
Date: Mon, 30 Dec 2024 15:56:14 +0530
Subject: [PATCH] fixup: complete splitPath, add splitRoot

---
 .../Internal/FileSystem/Path/Common.hs        | 382 +++++++++++++++---
 1 file changed, 330 insertions(+), 52 deletions(-)

diff --git a/core/src/Streamly/Internal/FileSystem/Path/Common.hs b/core/src/Streamly/Internal/FileSystem/Path/Common.hs
index d3186d36a3..5e097bcb2b 100644
--- a/core/src/Streamly/Internal/FileSystem/Path/Common.hs
+++ b/core/src/Streamly/Internal/FileSystem/Path/Common.hs
@@ -36,13 +36,22 @@ module Streamly.Internal.FileSystem.Path.Common
 
     , append
     , unsafeAppend
+    , splitRoot
+ -- , dropRoot
+ -- , joinRoot
     , splitPath
     , unsafeJoinPaths
+ -- , processParentRefs
 
     -- * Utilities
     , wordToChar
     , charToWord
     , unsafeIndexChar
+
+    -- * Internal
+    , unsafeSplitTopLevel
+    , unsafeSplitDrive
+    , unsafeSplitUNC
     )
 where
 
@@ -255,12 +264,18 @@ isSeparatorWord os = isSeparator os . wordToChar
 -- Path normalization
 ------------------------------------------------------------------------------
 
-countTrailingBy :: Unbox a => (a -> Bool) -> Array a -> Int
-countTrailingBy p arr =
+countWhile :: (a -> Bool) -> Stream Identity a -> Int
+countWhile p =
       runIdentity
-    $ Stream.fold Fold.length
-    $ Stream.takeWhile p
-    $ Array.readRev arr
+    . Stream.fold Fold.length
+    . Stream.takeWhile p
+
+{-# INLINE countLeadingBy #-}
+countLeadingBy :: Unbox a => (a -> Bool) -> Array a -> Int
+countLeadingBy p = countWhile p . Array.read
+
+countTrailingBy :: Unbox a => (a -> Bool) -> Array a -> Int
+countTrailingBy p = countWhile p . Array.readRev
 
 -- | If the path is @//@ the result is @/@. If it is @a//@ then the result is
 -- @a@.
@@ -304,60 +319,83 @@ unsafeHasDrive a
 
 -- | A path that starts with a alphabet followed by a colon e.g. @C:...@.
 hasDrive :: (Unbox a, Integral a) => Array a -> Bool
-hasDrive a = Array.byteLength a >= 2 && unsafeHasDrive a
+hasDrive a = Array.length a >= 2 && unsafeHasDrive a
 
 -- | A path that contains only an alphabet followed by a colon e.g. @C:@.
 isDrive :: (Unbox a, Integral a) => Array a -> Bool
-isDrive a = Array.byteLength a == 2 && unsafeHasDrive a
+isDrive a = Array.length a == 2 && unsafeHasDrive a
 
 ------------------------------------------------------------------------------
 -- Relative or Absolute
 ------------------------------------------------------------------------------
 
 -- | A path relative to cur dir it is either @.@ or starts with @./@.
-isRelativeCurDir :: (Unbox a, Integral a) => Array a -> Bool
-isRelativeCurDir a
-    -- Assuming the path is not empty.
+isRelativeCurDir :: (Unbox a, Integral a) => OS -> Array a -> Bool
+isRelativeCurDir os a
+    | len == 0 = False -- empty path should not occur
     | wordToChar (Array.getIndexUnsafe 0 a) /= '.' = False
-    | Array.byteLength a < 2 = True
-    | otherwise = isSeparator Windows (wordToChar (Array.getIndexUnsafe 1 a))
+    | len < 2 = True
+    | otherwise = isSeparatorWord os (Array.getIndexUnsafe 1 a)
+
+    where
+
+    len = Array.length a
+
+-- | A path starting with a separator.
+hasLeadingSeparator :: (Unbox a, Integral a) => OS -> Array a -> Bool
+hasLeadingSeparator os a
+    | Array.length a == 0 = False -- empty path should not occur
+    | isSeparatorWord os (Array.getIndexUnsafe 0 a) = True
+    | otherwise = False
+
+-- | A non-UNC path starting with a separator.
+isRelativeCurDriveRoot :: (Unbox a, Integral a) => Array a -> Bool
+isRelativeCurDriveRoot a
+    | len == 0 = False -- empty path should not occur
+    | len == 1 && sep0 = True
+    | sep0 && c0 /= c1 = True
+    | otherwise = False
+
+    where
 
--- | The path starting with a separator. On Windows this is relative to current
--- drive while on Posix this is absolute path as there is only one drive.
-isRelativeCurDrive :: (Unbox a, Integral a) => OS -> Array a -> Bool
-isRelativeCurDrive os a =
-    -- Assuming the path is not empty.
-    isSeparator os (wordToChar (Array.getIndexUnsafe 0 a))
+    len = Array.length a
+    c0 = Array.getIndexUnsafe 0 a
+    c1 = Array.getIndexUnsafe 1 a
+    sep0 = isSeparatorWord Windows c0
 
 -- | @C:@ or @C:a...@.
 isRelativeWithDrive :: (Unbox a, Integral a) => Array a -> Bool
 isRelativeWithDrive a =
     hasDrive a
-        && (  Array.byteLength a < 3
+        && (  Array.length a < 3
            || not (isSeparator Windows (unsafeIndexChar 2 a))
            )
 
 -- | @C:\...@. Note that "C:" or "C:a" is not absolute.
 isAbsoluteWithDrive :: (Unbox a, Integral a) => Array a -> Bool
 isAbsoluteWithDrive a =
-    Array.byteLength a >= 3
+    Array.length a >= 3
         && unsafeHasDrive a
         && isSeparator Windows (unsafeIndexChar 2 a)
 
--- | @\\\\...@
+-- | @\\\\...@ or @//...@
 isAbsoluteUNC :: (Unbox a, Integral a) => Array a -> Bool
 isAbsoluteUNC a
-    | Array.byteLength a < 2 = False
-    | unsafeIndexChar 0 a /= '\\' = False
-    | unsafeIndexChar 1 a /= '\\' = False
-    | otherwise = True
+    | Array.length a < 2 = False
+    | isSeparatorWord Windows c0 && c0 == c1 = True
+    | otherwise = False
+
+    where
+
+    c0 = Array.getIndexUnsafe 0 a
+    c1 = Array.getIndexUnsafe 1 a
 
 -- | Note that on Windows a path starting with a separator is relative to
 -- current drive while on Posix this is absolute path as there is only one
 -- drive.
 isAbsolute :: (Unbox a, Integral a) => OS -> Array a -> Bool
 isAbsolute Posix arr =
-    isRelativeCurDrive Posix arr
+    hasLeadingSeparator Posix arr
 isAbsolute Windows arr =
     isAbsoluteWithDrive arr || isAbsoluteUNC arr
 
@@ -377,11 +415,27 @@ isAbsolute Windows arr =
 --
 -- XXX For the untyped Path we can allow appending "./x" to other paths. We can
 -- leave this to the programmer. In typed paths we can allow "./x" in segments.
---
--- XXX C:\\ is invalid, \\share\ is invalid?
 -- XXX Empty path can be taken to mean "." except in case of UNC paths
-
--- | Rooted paths on Posix and Windows,
+--
+-- XXX "//share/x" works in powershell. But mixed forward and backward slashes
+-- do not work, it is treated as a path relative to current drive e.g.
+-- "\\/share/x" is treated as "C:/share/x".
+--
+-- Invalid paths:
+-- "C:\\\\"
+-- "C:\\\\x"
+-- "\\\\"
+-- "\\\\share"
+-- "\\\\share\\"
+-- "\\\\share\\\\"
+-- "\\\\share\\\\x"
+-- "\\\\?\\c:"
+-- "\\\\?\\c:\\\\\\"
+
+-- | Any path that starts with a separator, @./@ or a drive prefix is a rooted
+-- path.
+--
+-- Rooted paths on Posix and Windows,
 -- * @/...@ a path starting with a separator
 -- * @.@ current dir
 -- * @./...@ a location relative to current dir
@@ -399,13 +453,12 @@ isAbsolute Windows arr =
 --
 isLocation :: (Unbox a, Integral a) => OS -> Array a -> Bool
 isLocation Posix a =
-    isRelativeCurDrive Posix a
-        || isRelativeCurDir a
+    hasLeadingSeparator Posix a
+        || isRelativeCurDir Posix a
 isLocation Windows a =
-    isRelativeCurDrive Windows a
-        || isRelativeCurDir a
+    hasLeadingSeparator Windows a
+        || isRelativeCurDir Windows a
         || hasDrive a -- curdir-in-drive relative, drive absolute
-        || isAbsoluteUNC a
 
 isSegment :: (Unbox a, Integral a) => OS -> Array a -> Bool
 isSegment os = not . isLocation os
@@ -453,8 +506,8 @@ maybeFile os arr = do
 {-# INLINE doAppend #-}
 doAppend :: (Unbox a, Integral a) => OS -> Array a -> Array a -> Array a
 doAppend os a b = unsafePerformIO $ do
-    let lenA = Array.byteLength a
-        lenB = Array.byteLength b
+    let lenA = Array.length a
+        lenB = Array.length b
     assertM(lenA /= 0 && lenB /= 0)
     assertM(countTrailingBy (isSeparatorWord os) a == 0)
     let len = lenA + 1 + lenB
@@ -486,51 +539,276 @@ append :: (Unbox a, Integral a) =>
 append os toStr a b =
     withAppendCheck os toStr b (doAppend os a b)
 
+------------------------------------------------------------------------------
+-- Splitting
+------------------------------------------------------------------------------
+
+unsafeSplitPrefix :: (Unbox a, Integral a) =>
+    OS -> Int -> Array a -> (Array a, Array a)
+unsafeSplitPrefix os prefixLen arr = (drive, path)
+
+    where
+
+    len = Array.length arr
+    -- XXX Array.readFrom may be useful here
+    afterDrive = Array.getSliceUnsafe prefixLen (len - prefixLen) arr
+    n = countLeadingBy (isSeparatorWord os) afterDrive
+    cnt = prefixLen + n
+    drive = Array.getSliceUnsafe 0 cnt arr
+    path = Array.getSliceUnsafe cnt (len - cnt) arr
+
+-- XXX We can produce a normalized result for the drive during split.
+
+-- | Split a path prefixed with a separator into (drive, path) tuple.
+--
+-- >>> toListPosix (a,b) = (unpackPosix a, unpackPosix b)
+-- >>> splitPosix = toListPosix . Common.unsafeSplitTopLevel Common.Posix . packPosix
+--
+-- >>> toListWin (a,b) = (unpackWindows a, unpackWindows b)
+-- >>> splitWin = toListWin . Common.unsafeSplitTopLevel Common.Windows . packWindows
+--
+-- >>> splitPosix "/"
+-- ("/","")
+--
+-- >>> splitPosix "//"
+-- ("//","")
+--
+-- >>> splitPosix "/home"
+-- ("/","home")
+--
+-- >>> splitPosix "/home/user"
+-- ("/","home/user")
+--
+-- >>> splitWin "\\"
+-- ("\\","")
+--
+-- >>> splitWin "\\home"
+-- ("\\","home")
+unsafeSplitTopLevel :: (Unbox a, Integral a) =>
+    OS -> Array a -> (Array a, Array a)
+-- Note on Windows we should be here only when the path starts with exactly one
+-- separator, otherwise it would be UNC path. But on posix multiple separators
+-- are valid.
+unsafeSplitTopLevel os = unsafeSplitPrefix os 1
+
+-- In some cases there is no valid drive component e.g. "\\a\\b", though if we
+-- consider relative roots then we could use "\\" as the root in this case. In
+-- other cases there is no valid path component e.g. "C:" or "\\share\\" though
+-- the latter is not a valid path and in the former case we can use "." as the
+-- path component.
+--
+-- XXX Note, on windows C:\\\\x is an invalid path.
+
+-- | Split a path prefixed with drive into (drive, path) tuple.
+--
+-- >>> toList (a,b) = (unpackPosix a, unpackPosix b)
+-- >>> split = toList . Common.unsafeSplitDrive . packPosix
+--
+-- >>> split "C:"
+-- ("C:","")
+--
+-- >>> split "C:a"
+-- ("C:","a")
+--
+-- >>> split "C:\\"
+-- ("C:\\","")
+--
+-- >>> split "C:\\\\" -- this is invalid path
+-- ("C:\\\\","")
+--
+-- >>> split "C:\\\\a" -- this is invalid path
+-- ("C:\\\\","a")
+--
+-- >>> split "C:\\/a/b" -- is this valid path?
+-- ("C:\\/","a/b")
+unsafeSplitDrive :: (Unbox a, Integral a) => Array a -> (Array a, Array a)
+unsafeSplitDrive = unsafeSplitPrefix Windows 2
+
+-- | Skip separators and then parse the next path segment.
+-- Return (segment offset, segment length).
+parseSegment :: (Unbox a, Integral a) => Array a -> Int -> Int -> (Int, Int)
+parseSegment arr len sepOff = (segOff, segCnt)
+
+    where
+
+    arr1 = Array.getSliceUnsafe sepOff (len - sepOff) arr
+    sepCnt = countLeadingBy (isSeparatorWord Windows) arr1
+    segOff = sepOff + sepCnt
+
+    arr2 = Array.getSliceUnsafe segOff (len - segOff) arr
+    segCnt = countLeadingBy (not . isSeparatorWord Windows) arr2
+
+-- XXX We can split a path as "root, . , rest" or "root, /, rest".
+-- XXX We can remove the redundant path separator after the root. With that
+-- joining root vs other paths will become similar. But there are some special
+-- cases e.g. "C:a" does not have a separator, can we make this "C:.\\a"? In
+-- case of "/home" we have "/" as root and we cannot add another separator
+-- between this and the rest of the path.
+
+-- | Split a path prefixed with "\\" into (drive, path) tuple.
+--
+-- >>> toList (a,b) = (unpackPosix a, unpackPosix b)
+-- >>> split = toList . Common.unsafeSplitUNC . packPosix
+--
+-- >> split ""
+-- ("","")
+--
+-- >>> split "\\\\"
+-- ("\\\\","")
+--
+-- >>> split "\\\\server"
+-- ("\\\\server","")
+--
+-- >>> split "\\\\server\\"
+-- ("\\\\server\\","")
+--
+-- >>> split "\\\\server\\home"
+-- ("\\\\server\\","home")
+--
+-- >>> split "\\\\?\\c:"
+-- ("\\\\?\\c:","")
+--
+-- >>> split "\\\\?\\c:/"
+-- ("\\\\?\\c:/","")
+--
+-- >>> split "\\\\?\\c:\\home"
+-- ("\\\\?\\c:\\","home")
+--
+-- >>> split "\\\\?\\UNC/"
+-- ("\\\\?\\UNC/","")
+--
+-- >>> split "\\\\?\\UNC\\server"
+-- ("\\\\?\\UNC\\server","")
+--
+-- >>> split "\\\\?\\UNC/server\\home"
+-- ("\\\\?\\UNC/server\\","home")
+--
+unsafeSplitUNC :: (Unbox a, Integral a) => Array a -> (Array a, Array a)
+unsafeSplitUNC arr =
+    if cnt1 == 1 && unsafeIndexChar 2 arr == '?'
+    then do
+        if uncLen == 3
+                && unsafeIndexChar uncOff arr == 'U'
+                && unsafeIndexChar (uncOff + 1) arr == 'N'
+                && unsafeIndexChar (uncOff + 2) arr == 'C'
+        then unsafeSplitPrefix Windows (serverOff + serverLen) arr
+        else unsafeSplitPrefix Windows sepOff1 arr
+    else unsafeSplitPrefix Windows sepOff arr
+
+    where
+
+    len = Array.length arr
+    arr1 = Array.getSliceUnsafe 2 (len - 2) arr
+    cnt1 = countLeadingBy (not . isSeparatorWord Windows) arr1
+    sepOff = 2 + cnt1
+
+    -- XXX there should be only one separator in a valid path?
+    -- XXX it should either be UNC or two letter drive in a valid path
+    (uncOff, uncLen) = parseSegment arr len sepOff
+    sepOff1 = uncOff + uncLen
+    (serverOff, serverLen) = parseSegment arr len sepOff1
+
+-- XXX should we make the root Maybe? Both components will have to be Maybe to
+-- avoid an empty path.
+
+-- | If a path is rooted then separate the root and the remaining path
+-- otherwise root is returned as empty.
+--
+-- >>> toList (a,b) = (unpackPosix a, unpackPosix b)
+-- >>> splitPosix = toList . Common.splitRoot Common.Posix . packPosix
+--
+-- >>> splitPosix "/"
+-- ("/","")
+--
+-- >>> splitPosix "."
+-- (".","")
+--
+-- >>> splitPosix "/home"
+-- ("/","home")
+--
+-- >>> splitPosix "//"
+-- ("//","")
+--
+-- >>> splitPosix "./home"
+-- ("./","home")
+--
+-- >>> splitPosix "home"
+-- ("","home")
+--
+{-# INLINE splitRoot #-}
+splitRoot :: (Unbox a, Integral a) => OS -> Array a -> (Array a, Array a)
+splitRoot Posix arr
+    | isLocation Posix arr
+        = unsafeSplitTopLevel Posix arr
+    | otherwise = (Array.empty, arr)
+splitRoot Windows arr
+    | isRelativeCurDriveRoot arr || isRelativeCurDir Windows arr
+        = unsafeSplitTopLevel Windows arr
+    | hasDrive arr = unsafeSplitDrive arr
+    | isAbsoluteUNC arr = unsafeSplitUNC arr
+    | otherwise = (Array.empty, arr)
+
 -- | Split a path into components separated by the path separator. "."
 -- components in the path are ignored except in the leading position. Multiple
 -- consecutive separators are ignored.
 --
 -- >>> :{
---  splitPath Common.Posix = Stream.toList . fmap unpackPosix . Common.splitPath Common.Posix . packPosix
---  splitPath Common.Windows = Stream.toList . fmap unpackWindows . Common.splitPath Common.Windows . packWindows
+--  splitPosix = Stream.toList . fmap unpackPosix . Common.splitPath Common.Posix . packPosix
+--  splitWin = Stream.toList . fmap unpackWindows . Common.splitPath Common.Windows . packWindows
 -- :}
 --
--- >>> splitPath Common.Posix "."
+-- >>> splitPosix "."
 -- ["."]
 --
--- >>> splitPath Common.Posix "././"
--- ["."]
+-- >>> splitPosix "././"
+-- ["./"]
 --
--- >>> splitPath Common.Posix "./a/b/."
--- [".","a","b"]
+-- >>> splitPosix "./a/b/."
+-- ["./","a","b"]
 --
--- >>> splitPath Common.Posix "home//user/./..////\\directory/."
+-- >>> splitPosix "/"
+-- ["/"]
+--
+-- >>> splitPosix "/home"
+-- ["/","home"]
+--
+-- >>> splitWin "/home"
+-- ["/","home"]
+--
+-- >>> splitPosix "home//user/./..////\\directory/."
 -- ["home","user","..","\\directory"]
 --
--- >>> splitPath Common.Windows "home//user/./..////\\directory/."
+-- >>> splitWin "home//user/./..////\\directory/."
 -- ["home","user","..","directory"]
 --
 {-# INLINE splitPath #-}
 splitPath
-    :: forall a m. (Unbox a, Integral a, MonadIO m)
+    :: (Unbox a, Integral a, MonadIO m)
     => OS -> Array a -> Stream m (Array a)
 splitPath os arr =
-    Stream.indexEndBy_ (isSeparatorWord os) (Array.read arr)
-        & Stream.filter (not . shouldFilterOut)
-        & fmap (\(i, len) -> Array.getSliceUnsafe i len arr)
+    let stream =
+              Stream.indexEndBy_ (isSeparatorWord os) (Array.read rest)
+            & Stream.filter (not . shouldFilterOut)
+            & fmap (\(i, len) -> Array.getSliceUnsafe i len rest)
+
+    in if Array.length root == 0
+       then stream
+       else Stream.cons root stream
 
     where
 
+    (root, rest) = splitRoot os arr
+
     shouldFilterOut (off, len) =
         len == 0 ||
-            (len == 1 && unsafeIndexChar off arr == '.')
+            (len == 1 && unsafeIndexChar off rest == '.')
 
 -- | Join paths by path separator. Does not check if the paths being appended
 -- are rooted or path segments. Note that splitting and joining may not give
 -- exactly the original path but an equivalent, normalized path.
 {-# INLINE unsafeJoinPaths #-}
 unsafeJoinPaths
-    :: forall a m. (Unbox a, Integral a, MonadIO m)
+    :: (Unbox a, Integral a, MonadIO m)
     => OS -> Stream m (Array a) -> m (Array a)
 unsafeJoinPaths os =
     -- XXX This can be implemented more efficiently using an Array intersperse