From 4f625b4fea5bd43569e66a9b59f7591856c48cf9 Mon Sep 17 00:00:00 2001 From: FaustoNascimento Date: Thu, 18 Aug 2016 13:48:18 +0200 Subject: [PATCH] Multiple changes below Added RowCount and ColumnCount parameters. Added IgnoreEmptyCells parameter (if used, the object will not have empty properties. If the resulting object contains no properties at all (because they are all empty), then no object will be sent to the pipeline). Both RowCount and ColumnCount allow the user to read past the used boundaries of the worksheet, in which case the output will contain either objects with all blank properties (if the row is past the used boundaries) or objects with extra empty properties (if the column is past the used boundaries). This is a powerful tool that simplifies, say, adding a column to an existing Excel worksheet: just increase the number of columns to return by one, set its value on the returned objects and export them back to Excel. Be aware that if used in conjunction with IgnoreEmptyCells the bounds of the worksheet will be enforced (and obviously any empty cells in the middle will be ignored). Added RowHeader, so you can now specify which row to use as the header. Defaults to row 1. Added some nicer logic to the headers: the Header array will always be used first, but its size no longer needs to match the number of columns to read. If the Header array is not specified, or the number of columns to read exceeds the length of the Header array, it falls back to using the RowHeader value (if FirstRowIsData is not set). If FirstRowIsData is set, the header cell for a particular column is empty, or a Header element is empty, the name will be replaced with ColumnX, where X is one or more letters that uniquely identify the column (same as Excel does) - Example: ColumnAX. Lastly, if a header from the RowHeader is repeated, an index will be added to it. Example: if there are two columns named Test, on importing the first will be named Test and the second will be named Test1. 
If there were a third, it would be Test2, etc. If a header from the Header array is repeated it will behave as if it was empty (i.e., try to first use the RowHeader for that column or ColumnX naming notation). If an excel file contains empty rows/columns to the left (i.e, before the data actually starts) they will be ignored by default. Use -RowStart or -ColumnStart to override this. --- PSExcel/Import-XLSX.ps1 | 195 ++++++++++++++++++++++++---------------- 1 file changed, 118 insertions(+), 77 deletions(-) diff --git a/PSExcel/Import-XLSX.ps1 b/PSExcel/Import-XLSX.ps1 index cdeb9ff..97d0121 100644 --- a/PSExcel/Import-XLSX.ps1 +++ b/PSExcel/Import-XLSX.ps1 @@ -1,76 +1,54 @@ -function Import-XLSX { +function Import-XLSX { <# .SYNOPSIS Import data from Excel - .DESCRIPTION Import data from Excel - .PARAMETER Path Path to an xlsx file to import - .PARAMETER Sheet Index or name of Worksheet to import - .PARAMETER Header Replacement headers. Must match order and count of your data's properties. - .PARAMETER RowStart First row to start reading from, typically the header. Default is 1 - .PARAMETER ColumnStart First column to start reading from. Default is 1 - .PARAMETER FirstRowIsData Indicates that the first row is data, not headers. Must be used with -Header. - .PARAMETER Text Extract cell text, rather than value. - For example, if you have a cell with value 5: If the Number Format is '0', the text would be 5 If the Number Format is 0.00, the text would be 5.00 - .EXAMPLE Import-XLSX -Path "C:\Excel.xlsx" - #Import data from C:\Excel.xlsx - .EXAMPLE Import-XLSX -Path "C:\Excel.xlsx" -Header One, Two, Five - # Import data from C:\Excel.xlsx # Replace headers with One, Two, Five - .EXAMPLE Import-XLSX -Path "C:\Excel.xlsx" -Header One, Two, Five -FirstRowIsData -Sheet 2 - # Import data from C:\Excel.xlsx # Assume first row is data # Use headers One, Two, Five # Pull from sheet 2 (sheet 1 is default) - .EXAMPLE # A B C # 1 Random text to mess with you! 
# 2 Header1 Header2 Header3 # 3 data1 Data2 Data3 - # Your worksheet has data you don't care about in the first row or column # Use the ColumnStart or RowStart parameters to solve this. - Import-XLSX -Path C:\RandomTextInRow1.xlsx -RowStart 2 - .NOTES Thanks to Doug Finke for his example: https://github.com/dfinke/ImportExcel/blob/master/ImportExcel.psm1 - Thanks to Philip Thompson for an expansive set of examples on working with EPPlus in PowerShell: https://excelpslib.codeplex.com/ - .LINK https://github.com/RamblingCookieMonster/PSExcel - .FUNCTIONALITY Excel #> @@ -88,12 +66,47 @@ [switch]$FirstRowIsData, - [switch]$Text, + [ValidateSet('Text', 'Value')] + [string]$Interpreter = 'Value', + + [UInt32]$RowStart, + + [UInt32]$ColumnStart, - [int]$RowStart = 1, + [UInt32]$RowCount, + + [UInt32]$ColumnCount, - [int]$ColumnStart = 1 + [switch]$IgnoreEmptyCells, + + [UInt32]$RowHeader ) + Begin + { + function ColumnNumberToColumnName([uint64] $ColumnNumber) + { + while ($ColumnNumber -gt 0) + { + $Modulo = ($ColumnNumber - 1) % 26 + $ColumnName = [Char] (65 + $Modulo) + $ColumnName; + $ColumnNumber = [uint64](($ColumnNumber - $Modulo) / 26); + } + + $ColumnName + } + + function ColumnNameToColumnNumber([string] $ColumnName) + { + for ($i = 0; $i -lt $ColumnName.Length; $i++) + { + $sum *= 26; + $sum += ($ColumnName[$i] - 65 + 1); + } + + $sum + } + } + Process { foreach($file in $path) @@ -101,8 +114,8 @@ #Resolve relative paths... Thanks Oisin! 
http://stackoverflow.com/a/3040982/3067642 $file = $ExecutionContext.SessionState.Path.GetUnresolvedProviderPathFromPSPath($file) - write-verbose "target excel file $($file)" - + Write-Verbose "target excel file $($file)" + Try { $xl = New-Object OfficeOpenXml.ExcelPackage $file @@ -123,13 +136,31 @@ else { $worksheet = $workbook.Worksheets[$Sheet] - $dimension = $worksheet.Dimension + + if (-not $worksheet.Dimension) + { + Write-Verbose "Worksheet is empty" + continue + } + + $ranges = [regex]::Matches($worksheet.Dimension.Address, '[A-Za-z]+|\d+') + + if (-not $ColumnStart) + { + $ColumnStart = ColumnNameToColumnNumber $ranges[0].Value + } - $Rows = $dimension.Rows - $Columns = $dimension.Columns + if (-not $RowStart) + { + $RowStart = $ranges[1].Value + } + + $ColumnEnd = if ($ColumnCount) {$ColumnCount + $ColumnStart} else {ColumnNameToColumnNumber $ranges[2].Value} + $RowEnd = if ($RowCount) {$RowCount + $RowStart} else {$ranges[3].Value} + $RowHeader = if ($RowHeader) {$RowHeader} else {$RowStart} - $ColumnEnd = $Columns + $ColumnStart - 1 - $RowEnd = $Rows + $RowStart - 1 + $RowCount = $RowEnd - $RowStart + 1 + $ColumnCount = $ColumnEnd - $ColumnStart + 1 } } @@ -139,64 +170,66 @@ continue } - if($Header -and $Header.count -gt 0) + # Define headears + $Headers = @() + + foreach ($i in $ColumnStart..$ColumnEnd) { - if($Header.count -ne $Columns) + if ($Header -and -not [string]::IsNullOrEmpty($Header[$i - $ColumnStart]) -and $Header[$i - $ColumnStart] -notin $Headers) { - Write-Error "Found '$columns' columns, provided $($header.count) headers. You must provide a header for every column." 
+ $Headers += $Header[$i - $ColumnStart] + continue } - Write-Verbose "User defined headers: $Header" - } - else - { - $Header = @( foreach ($Column in $ColumnStart..$ColumnEnd) + + $Value = ([string] $worksheet.Cells.Item($RowHeader,$i).$Interpreter).Trim() + + if ([string]::IsNullOrEmpty($Value) -or $FirstRowIsData) { - if($Text) - { - $PotentialHeader = $worksheet.Cells.Item($RowStart,$Column).Text - } - else - { - $PotentialHeader = $worksheet.Cells.Item($RowStart,$Column).Value - } + $Headers += "Column$(ColumnNumberToColumnName $i)" + continue + } - if( -Not $PotentialHeader -Or $PotentialHeader.Trim().Equals("") ) - { - Write-Warning "Header in column $Column is whitespace or empty, setting header to ''" - $PotentialHeader = "" # Use placeholder name - } - $PotentialHeader - }) + $i = 1 + $originalValue = $Value + + while ($Value -in $Headers) + { + $Value = "$originalValue$i" + $i++ + } + + $Headers += $Value } - [string[]]$SelectedHeaders = @( $Header | select -Unique ) - Write-Verbose "Found $Rows rows, $Columns columns, with headers:`n$($Header | Out-String)" + Write-Verbose "Will read $RowCount rows, $ColumnCount columns, with headers:`n$($Header | Out-String)" + + $typeName = "Excel$(([System.IO.FileInfo]$File).BaseName)" + Update-TypeData -DefaultDisplayPropertySet $Headers -TypeName $typeName -Force if(-not $FirstRowIsData) { $RowStart++ - } - foreach ($Row in $RowStart..$RowEnd) + if ($RowStart -gt $RowEnd) + { + continue + } + } + + foreach ($RowId in $RowStart..$RowEnd) { $RowData = @{} + $RowHeaders = @() - foreach ($Column in 0..($Columns - 1)) + foreach ($ColumnId in $ColumnStart..$ColumnEnd) { - $Name = $Header[$Column] - if($Text) - { - $Value = $worksheet.Cells.Item($Row, ($Column + $ColumnStart)).Text - } - else - { - $Value = $worksheet.Cells.Item($Row, ($Column + $ColumnStart)).Value - } - - Write-Debug "Row: $Row, Column: $Column, Name: $Name, Value = $Value" + $Name = $Headers[$ColumnId - $ColumnStart] + + $Value = 
$worksheet.Cells.Item($RowId, $ColumnId).$Interpreter + Write-Debug "Row: $RowId, Column: $ColumnId, Name: $Name, Value = $Value" #Handle dates, they're too common to overlook... Could use help, not sure if this is the best regex to use? - $Format = $worksheet.Cells.Item($Row, ($Column + $ColumnStart)).style.numberformat.format + $Format = $worksheet.Cells.Item($RowId, $ColumnId).style.numberformat.format if($Format -match '\w{1,4}/\w{1,2}/\w{1,4}( \w{1,2}:\w{1,2})?') { Try @@ -209,20 +242,28 @@ } } - if($RowData.ContainsKey($Name) ) + if ($IgnoreEmptyCells -and [string]::IsNullOrEmpty($Value)) { - Write-Warning "Duplicate header for '$Name' found, with value '$Value', in row $Row" + Write-Verbose "Ignoring empty cell on row $RowId and column $ColumnId" } else { $RowData.Add($Name, $Value) + $RowHeaders += $Name } } - New-Object -TypeName PSObject -Property $RowData | Select -Property $SelectedHeaders + + if (@($psObject.PSObject.Properties).Count -gt 0) + { + $psObject = New-Object -TypeName PSObject -Property $RowData + $psObject = $psObject | Select $RowHeaders + $psObject.PSTypeNames.Insert(0, $typeName) + $psObject + } } $xl.Dispose() $xl = $null } } -} \ No newline at end of file +}