PowerShell script to create Excel spreadsheet from HTML file
up vote
6
down vote
favorite
This script is designed to create an Excel spreadsheet from each HTML table from auto-generated index.html
file.
It needs to be very robust but unfortunately I don't have many example files to test it against. Any feedback on style, possible points of failure, or vulnerabilities would be very appreciated.
Powershell
function Main
{
# find each file called "index.html" in the folders below the directory of the script
$Directory = $PSScriptRoot
$htmlFiles = Get-ChildItem -path $Directory -Recurse -Include index.html
# for each html file, create an excel file in the directory of the script
# called Exceptions Log <date of HTML file creation>.xlsx
ForEach($htmlFile in $htmlFiles)
{
$fileCreationDate = (Get-ChildItem $htmlFile).CreationTime
$fileString = $fileCreationDate.ToString("yyyy-mm-dd hh.mm.ss")
$outputFile = "$DirectoryExceptions Log $fileString.xlsx"
Write-Host "Parsing HTML file: $htmlFile"
$document = ParseHtml $htmlFile
Write-Host "Creating Excel file: $outputFile"
CreateExcelFromHtml $document $outputFile
}
}
# load local HTML file and return COM object that we can query
function ParseHtml([string]$fileIn)
{
$html = New-Object -ComObject "HTMLFile"
$source = Get-Content $fileIn -Raw
$html.IHTMLDocument2_write($source)
return $html
}
# create excel spreadsheet from html document
function CreateExcelFromHtml($document, [string]$excelFile)
{
$excel = New-Object -ComObject Excel.Application
$excel.Visible = $false # $true
$excel.DisplayAlerts = $false
$workbook = $excel.Workbooks.Add()
$sheet = $workbook.ActiveSheet
Write-Host "Filling in Excel file"
$row = 1;
# iterate over tables
$tables = $document.getElementsByTagName("table")
$captions = $document.getElementsByTagName("caption")
for($t=0; $t -lt $tables.length; $t++){
# create 2 row space between tables
$row += 2
#iterate over rows
$rows = $tables[$t].getElementsByTagName("tr")
for($i=0; $i -lt $rows.length; $i++)
{
# iterate over cells
# every other cell is a blank cell, however cells with content are always nested in a "font" tag
$cells = $rows[$i].getElementsByTagName("font")
for($j=0; $j -lt $cells.length; $j++)
{
# load each cell contents into an excel cell
$sheet.Cells.Item($row + $i, $j+1) = $cells[$j].innerText.Trim()
}
}
# add corrective action column to first table
if($t -eq 0){
$j += 1 # increment table size
$sheet.Cells.Item($row, $j) = "Corrective Action";
}
# get table title from corresponding caption
$title = [string]$captions[$t].innerText
StyleTable $sheet $title $row $i $j
$row += $i
}
Write-Host "Formating Excel file"
$sheet.UsedRange.ColumnWidth = 18
Write-Host "Saving Excel file"
$workbook.SaveAs($excelFile, 51)
$workbook.Close();
$excel.Quit()
}
Function StyleTable($sheet, $tableTitle, $startRow, $numberOfRows, $endColumn){
# give the table a title and style it
$titleCell = $sheet.Cells($startRow - 1, 1)
$titleCell.Value = $tableTitle
$titleCell.Font.Name = "Cambria"
$titleCell.Font.Size = 14
$titleCell.Font.Bold = $true
$titleCell.Font.ThemeColor = 5
# give the table a border
$tableRange = $sheet.Range($sheet.Cells($startRow, 1), `
$sheet.Cells($startRow + $numberOfRows - 1, $endColumn))
$tableRange.Borders.Weight = 2
# make the table header bold and blue
$headerRange = $sheet.Range($sheet.Cells($startRow, 1), `
$sheet.Cells($startRow, $endColumn))
$headerRange.Interior.ColorIndex = 37
$headerRange.Font.Bold = $true
}
Main
Example index.html
file
<html>
<head>
<title>FooBar</title>
</head>
<body>
<center>
<p>
<b><font size=6>Bar</font>
</b><br>
<b><font size=4>Foo</font>
</b><br>
<font size=3>FooBar</font>
</p>
<table>
<caption><font size=3><b>Title</b></font></caption>
<thead>
<th align=left valign=bottom><font size=2><u>A</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>B</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>C</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>D</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>E</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>F</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>G</u></font></th><th> </th>
</thead>
<tbody>
<tr>
<td valign=top><font size=2>1</td><td> </td>
<td valign=top><font size=2>2</td><td> </td>
<td valign=top><font size=2>3</td><td> </td>
<td valign=top><font size=2>4</td><td> </td>
<td valign=top><font size=2>5</td><td> </td>
<td valign=top><font size=2>6</td><td> </td>
<td valign=top><font size=2>7</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
</tbody>
</table>
<br>
<table>
<caption><font size=3><b>Title 2</b></font></caption>
<thead>
<th align=left valign=bottom><font size=2><u>Foo</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>Bar</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>FoobBar</u></font></th><th> </th>
</thead>
<tbody>
<tr>
<td valign=top><font size=2>Fizz</td><td> </td>
<td valign=top><font size=2>Buzz</td><td> </td>
<td valign=top><font size=2>FizzBuzz<br>
<br>
</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Foo</td><td> </td>
<td valign=top><font size=2>Bar</td><td> </td>
<td valign=top><font size=2>FooBar<br>
<br>
</td><td> </td>
</tr>
</tbody>
</table>
<br>
<table>
<caption><font size=3><b>Title 3</b></font></caption>
<thead>
<th align=left valign=bottom><font size=2><u>Fizz</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>Buzz</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>FizzBuzz</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>FooBar</u></font></th><th> </th>
</thead>
<tbody>
<tr>
<td valign=top><font size=2>Foo</td><td> </td>
<td valign=top><font size=2>Bar</td><td> </td>
<td valign=top><font size=2>FooBar</td><td> </td>
<td valign=top><font size=2>FizzBuzz</td><td> </td>
</tr>
</tbody>
</table>
<br>
<p>
<b><font size=4>Something</font>
</b><br>
</p>
<table>
<caption><font size=3><b>Title 4</b></font></caption>
<thead>
<th align=left valign=bottom><font size=2><u>Foo</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>Bar</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>Fizz</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>Buzz</u></font></th><th> </th>
</thead>
<tbody>
<tr>
<td valign=top><font size=2>Foo</td><td> </td>
<td valign=top><font size=2>Bar</td><td> </td>
<td valign=top><font size=2>Fizz</td><td> </td>
<td valign=top><font size=2>Buzz</td><td> </td>
</tr>
<tr>
<b><td valign=top><font size=2>Foo</td><td> </td>
</b><td valign=top><font size=2>Bar</td><td> </td>
<td valign=top><font size=2>Fizz</td><td> </td>
<td valign=top><font size=2>Buzz</td><td> </td>
</tr>
</tbody>
</table>
<br>
</center>
</body>
</html>
Output
html excel powershell
add a comment |
up vote
6
down vote
favorite
This script is designed to create an Excel spreadsheet from each HTML table from auto-generated index.html
file.
It needs to be very robust but unfortunately I don't have many example files to test it against. Any feedback on style, possible points of failure, or vulnerabilities would be very appreciated.
Powershell
function Main
{
# find each file called "index.html" in the folders below the directory of the script
$Directory = $PSScriptRoot
$htmlFiles = Get-ChildItem -path $Directory -Recurse -Include index.html
# for each html file, create an excel file in the directory of the script
# called Exceptions Log <date of HTML file creation>.xlsx
ForEach($htmlFile in $htmlFiles)
{
$fileCreationDate = (Get-ChildItem $htmlFile).CreationTime
$fileString = $fileCreationDate.ToString("yyyy-mm-dd hh.mm.ss")
$outputFile = "$DirectoryExceptions Log $fileString.xlsx"
Write-Host "Parsing HTML file: $htmlFile"
$document = ParseHtml $htmlFile
Write-Host "Creating Excel file: $outputFile"
CreateExcelFromHtml $document $outputFile
}
}
# load local HTML file and return COM object that we can query
function ParseHtml([string]$fileIn)
{
$html = New-Object -ComObject "HTMLFile"
$source = Get-Content $fileIn -Raw
$html.IHTMLDocument2_write($source)
return $html
}
# create excel spreadsheet from html document
function CreateExcelFromHtml($document, [string]$excelFile)
{
$excel = New-Object -ComObject Excel.Application
$excel.Visible = $false # $true
$excel.DisplayAlerts = $false
$workbook = $excel.Workbooks.Add()
$sheet = $workbook.ActiveSheet
Write-Host "Filling in Excel file"
$row = 1;
# iterate over tables
$tables = $document.getElementsByTagName("table")
$captions = $document.getElementsByTagName("caption")
for($t=0; $t -lt $tables.length; $t++){
# create 2 row space between tables
$row += 2
#iterate over rows
$rows = $tables[$t].getElementsByTagName("tr")
for($i=0; $i -lt $rows.length; $i++)
{
# iterate over cells
# every other cell is a blank cell, however cells with content are always nested in a "font" tag
$cells = $rows[$i].getElementsByTagName("font")
for($j=0; $j -lt $cells.length; $j++)
{
# load each cell contents into an excel cell
$sheet.Cells.Item($row + $i, $j+1) = $cells[$j].innerText.Trim()
}
}
# add corrective action column to first table
if($t -eq 0){
$j += 1 # increment table size
$sheet.Cells.Item($row, $j) = "Corrective Action";
}
# get table title from corresponding caption
$title = [string]$captions[$t].innerText
StyleTable $sheet $title $row $i $j
$row += $i
}
Write-Host "Formating Excel file"
$sheet.UsedRange.ColumnWidth = 18
Write-Host "Saving Excel file"
$workbook.SaveAs($excelFile, 51)
$workbook.Close();
$excel.Quit()
}
Function StyleTable($sheet, $tableTitle, $startRow, $numberOfRows, $endColumn){
# give the table a title and style it
$titleCell = $sheet.Cells($startRow - 1, 1)
$titleCell.Value = $tableTitle
$titleCell.Font.Name = "Cambria"
$titleCell.Font.Size = 14
$titleCell.Font.Bold = $true
$titleCell.Font.ThemeColor = 5
# give the table a border
$tableRange = $sheet.Range($sheet.Cells($startRow, 1), `
$sheet.Cells($startRow + $numberOfRows - 1, $endColumn))
$tableRange.Borders.Weight = 2
# make the table header bold and blue
$headerRange = $sheet.Range($sheet.Cells($startRow, 1), `
$sheet.Cells($startRow, $endColumn))
$headerRange.Interior.ColorIndex = 37
$headerRange.Font.Bold = $true
}
Main
Example index.html
file
<html>
<head>
<title>FooBar</title>
</head>
<body>
<center>
<p>
<b><font size=6>Bar</font>
</b><br>
<b><font size=4>Foo</font>
</b><br>
<font size=3>FooBar</font>
</p>
<table>
<caption><font size=3><b>Title</b></font></caption>
<thead>
<th align=left valign=bottom><font size=2><u>A</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>B</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>C</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>D</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>E</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>F</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>G</u></font></th><th> </th>
</thead>
<tbody>
<tr>
<td valign=top><font size=2>1</td><td> </td>
<td valign=top><font size=2>2</td><td> </td>
<td valign=top><font size=2>3</td><td> </td>
<td valign=top><font size=2>4</td><td> </td>
<td valign=top><font size=2>5</td><td> </td>
<td valign=top><font size=2>6</td><td> </td>
<td valign=top><font size=2>7</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
</tbody>
</table>
<br>
<table>
<caption><font size=3><b>Title 2</b></font></caption>
<thead>
<th align=left valign=bottom><font size=2><u>Foo</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>Bar</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>FoobBar</u></font></th><th> </th>
</thead>
<tbody>
<tr>
<td valign=top><font size=2>Fizz</td><td> </td>
<td valign=top><font size=2>Buzz</td><td> </td>
<td valign=top><font size=2>FizzBuzz<br>
<br>
</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Foo</td><td> </td>
<td valign=top><font size=2>Bar</td><td> </td>
<td valign=top><font size=2>FooBar<br>
<br>
</td><td> </td>
</tr>
</tbody>
</table>
<br>
<table>
<caption><font size=3><b>Title 3</b></font></caption>
<thead>
<th align=left valign=bottom><font size=2><u>Fizz</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>Buzz</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>FizzBuzz</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>FooBar</u></font></th><th> </th>
</thead>
<tbody>
<tr>
<td valign=top><font size=2>Foo</td><td> </td>
<td valign=top><font size=2>Bar</td><td> </td>
<td valign=top><font size=2>FooBar</td><td> </td>
<td valign=top><font size=2>FizzBuzz</td><td> </td>
</tr>
</tbody>
</table>
<br>
<p>
<b><font size=4>Something</font>
</b><br>
</p>
<table>
<caption><font size=3><b>Title 4</b></font></caption>
<thead>
<th align=left valign=bottom><font size=2><u>Foo</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>Bar</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>Fizz</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>Buzz</u></font></th><th> </th>
</thead>
<tbody>
<tr>
<td valign=top><font size=2>Foo</td><td> </td>
<td valign=top><font size=2>Bar</td><td> </td>
<td valign=top><font size=2>Fizz</td><td> </td>
<td valign=top><font size=2>Buzz</td><td> </td>
</tr>
<tr>
<b><td valign=top><font size=2>Foo</td><td> </td>
</b><td valign=top><font size=2>Bar</td><td> </td>
<td valign=top><font size=2>Fizz</td><td> </td>
<td valign=top><font size=2>Buzz</td><td> </td>
</tr>
</tbody>
</table>
<br>
</center>
</body>
</html>
Output
html excel powershell
add a comment |
up vote
6
down vote
favorite
up vote
6
down vote
favorite
This script is designed to create an Excel spreadsheet from each HTML table from auto-generated index.html
file.
It needs to be very robust but unfortunately I don't have many example files to test it against. Any feedback on style, possible points of failure, or vulnerabilities would be very appreciated.
Powershell
function Main
{
# find each file called "index.html" in the folders below the directory of the script
$Directory = $PSScriptRoot
$htmlFiles = Get-ChildItem -path $Directory -Recurse -Include index.html
# for each html file, create an excel file in the directory of the script
# called Exceptions Log <date of HTML file creation>.xlsx
ForEach($htmlFile in $htmlFiles)
{
$fileCreationDate = (Get-ChildItem $htmlFile).CreationTime
$fileString = $fileCreationDate.ToString("yyyy-mm-dd hh.mm.ss")
$outputFile = "$DirectoryExceptions Log $fileString.xlsx"
Write-Host "Parsing HTML file: $htmlFile"
$document = ParseHtml $htmlFile
Write-Host "Creating Excel file: $outputFile"
CreateExcelFromHtml $document $outputFile
}
}
# load local HTML file and return COM object that we can query
function ParseHtml([string]$fileIn)
{
$html = New-Object -ComObject "HTMLFile"
$source = Get-Content $fileIn -Raw
$html.IHTMLDocument2_write($source)
return $html
}
# create excel spreadsheet from html document
function CreateExcelFromHtml($document, [string]$excelFile)
{
$excel = New-Object -ComObject Excel.Application
$excel.Visible = $false # $true
$excel.DisplayAlerts = $false
$workbook = $excel.Workbooks.Add()
$sheet = $workbook.ActiveSheet
Write-Host "Filling in Excel file"
$row = 1;
# iterate over tables
$tables = $document.getElementsByTagName("table")
$captions = $document.getElementsByTagName("caption")
for($t=0; $t -lt $tables.length; $t++){
# create 2 row space between tables
$row += 2
#iterate over rows
$rows = $tables[$t].getElementsByTagName("tr")
for($i=0; $i -lt $rows.length; $i++)
{
# iterate over cells
# every other cell is a blank cell, however cells with content are always nested in a "font" tag
$cells = $rows[$i].getElementsByTagName("font")
for($j=0; $j -lt $cells.length; $j++)
{
# load each cell contents into an excel cell
$sheet.Cells.Item($row + $i, $j+1) = $cells[$j].innerText.Trim()
}
}
# add corrective action column to first table
if($t -eq 0){
$j += 1 # increment table size
$sheet.Cells.Item($row, $j) = "Corrective Action";
}
# get table title from corresponding caption
$title = [string]$captions[$t].innerText
StyleTable $sheet $title $row $i $j
$row += $i
}
Write-Host "Formating Excel file"
$sheet.UsedRange.ColumnWidth = 18
Write-Host "Saving Excel file"
$workbook.SaveAs($excelFile, 51)
$workbook.Close();
$excel.Quit()
}
Function StyleTable($sheet, $tableTitle, $startRow, $numberOfRows, $endColumn){
# give the table a title and style it
$titleCell = $sheet.Cells($startRow - 1, 1)
$titleCell.Value = $tableTitle
$titleCell.Font.Name = "Cambria"
$titleCell.Font.Size = 14
$titleCell.Font.Bold = $true
$titleCell.Font.ThemeColor = 5
# give the table a border
$tableRange = $sheet.Range($sheet.Cells($startRow, 1), `
$sheet.Cells($startRow + $numberOfRows - 1, $endColumn))
$tableRange.Borders.Weight = 2
# make the table header bold and blue
$headerRange = $sheet.Range($sheet.Cells($startRow, 1), `
$sheet.Cells($startRow, $endColumn))
$headerRange.Interior.ColorIndex = 37
$headerRange.Font.Bold = $true
}
Main
Example index.html
file
<html>
<head>
<title>FooBar</title>
</head>
<body>
<center>
<p>
<b><font size=6>Bar</font>
</b><br>
<b><font size=4>Foo</font>
</b><br>
<font size=3>FooBar</font>
</p>
<table>
<caption><font size=3><b>Title</b></font></caption>
<thead>
<th align=left valign=bottom><font size=2><u>A</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>B</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>C</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>D</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>E</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>F</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>G</u></font></th><th> </th>
</thead>
<tbody>
<tr>
<td valign=top><font size=2>1</td><td> </td>
<td valign=top><font size=2>2</td><td> </td>
<td valign=top><font size=2>3</td><td> </td>
<td valign=top><font size=2>4</td><td> </td>
<td valign=top><font size=2>5</td><td> </td>
<td valign=top><font size=2>6</td><td> </td>
<td valign=top><font size=2>7</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
</tbody>
</table>
<br>
<table>
<caption><font size=3><b>Title 2</b></font></caption>
<thead>
<th align=left valign=bottom><font size=2><u>Foo</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>Bar</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>FoobBar</u></font></th><th> </th>
</thead>
<tbody>
<tr>
<td valign=top><font size=2>Fizz</td><td> </td>
<td valign=top><font size=2>Buzz</td><td> </td>
<td valign=top><font size=2>FizzBuzz<br>
<br>
</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Foo</td><td> </td>
<td valign=top><font size=2>Bar</td><td> </td>
<td valign=top><font size=2>FooBar<br>
<br>
</td><td> </td>
</tr>
</tbody>
</table>
<br>
<table>
<caption><font size=3><b>Title 3</b></font></caption>
<thead>
<th align=left valign=bottom><font size=2><u>Fizz</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>Buzz</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>FizzBuzz</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>FooBar</u></font></th><th> </th>
</thead>
<tbody>
<tr>
<td valign=top><font size=2>Foo</td><td> </td>
<td valign=top><font size=2>Bar</td><td> </td>
<td valign=top><font size=2>FooBar</td><td> </td>
<td valign=top><font size=2>FizzBuzz</td><td> </td>
</tr>
</tbody>
</table>
<br>
<p>
<b><font size=4>Something</font>
</b><br>
</p>
<table>
<caption><font size=3><b>Title 4</b></font></caption>
<thead>
<th align=left valign=bottom><font size=2><u>Foo</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>Bar</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>Fizz</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>Buzz</u></font></th><th> </th>
</thead>
<tbody>
<tr>
<td valign=top><font size=2>Foo</td><td> </td>
<td valign=top><font size=2>Bar</td><td> </td>
<td valign=top><font size=2>Fizz</td><td> </td>
<td valign=top><font size=2>Buzz</td><td> </td>
</tr>
<tr>
<b><td valign=top><font size=2>Foo</td><td> </td>
</b><td valign=top><font size=2>Bar</td><td> </td>
<td valign=top><font size=2>Fizz</td><td> </td>
<td valign=top><font size=2>Buzz</td><td> </td>
</tr>
</tbody>
</table>
<br>
</center>
</body>
</html>
Output
html excel powershell
This script is designed to create an Excel spreadsheet from each HTML table from auto-generated index.html
file.
It needs to be very robust but unfortunately I don't have many example files to test it against. Any feedback on style, possible points of failure, or vulnerabilities would be very appreciated.
Powershell
function Main
{
# find each file called "index.html" in the folders below the directory of the script
$Directory = $PSScriptRoot
$htmlFiles = Get-ChildItem -path $Directory -Recurse -Include index.html
# for each html file, create an excel file in the directory of the script
# called Exceptions Log <date of HTML file creation>.xlsx
ForEach($htmlFile in $htmlFiles)
{
$fileCreationDate = (Get-ChildItem $htmlFile).CreationTime
$fileString = $fileCreationDate.ToString("yyyy-mm-dd hh.mm.ss")
$outputFile = "$DirectoryExceptions Log $fileString.xlsx"
Write-Host "Parsing HTML file: $htmlFile"
$document = ParseHtml $htmlFile
Write-Host "Creating Excel file: $outputFile"
CreateExcelFromHtml $document $outputFile
}
}
# load local HTML file and return COM object that we can query
function ParseHtml([string]$fileIn)
{
$html = New-Object -ComObject "HTMLFile"
$source = Get-Content $fileIn -Raw
$html.IHTMLDocument2_write($source)
return $html
}
# create excel spreadsheet from html document
function CreateExcelFromHtml($document, [string]$excelFile)
{
$excel = New-Object -ComObject Excel.Application
$excel.Visible = $false # $true
$excel.DisplayAlerts = $false
$workbook = $excel.Workbooks.Add()
$sheet = $workbook.ActiveSheet
Write-Host "Filling in Excel file"
$row = 1;
# iterate over tables
$tables = $document.getElementsByTagName("table")
$captions = $document.getElementsByTagName("caption")
for($t=0; $t -lt $tables.length; $t++){
# create 2 row space between tables
$row += 2
#iterate over rows
$rows = $tables[$t].getElementsByTagName("tr")
for($i=0; $i -lt $rows.length; $i++)
{
# iterate over cells
# every other cell is a blank cell, however cells with content are always nested in a "font" tag
$cells = $rows[$i].getElementsByTagName("font")
for($j=0; $j -lt $cells.length; $j++)
{
# load each cell contents into an excel cell
$sheet.Cells.Item($row + $i, $j+1) = $cells[$j].innerText.Trim()
}
}
# add corrective action column to first table
if($t -eq 0){
$j += 1 # increment table size
$sheet.Cells.Item($row, $j) = "Corrective Action";
}
# get table title from corresponding caption
$title = [string]$captions[$t].innerText
StyleTable $sheet $title $row $i $j
$row += $i
}
Write-Host "Formating Excel file"
$sheet.UsedRange.ColumnWidth = 18
Write-Host "Saving Excel file"
$workbook.SaveAs($excelFile, 51)
$workbook.Close();
$excel.Quit()
}
Function StyleTable($sheet, $tableTitle, $startRow, $numberOfRows, $endColumn){
# give the table a title and style it
$titleCell = $sheet.Cells($startRow - 1, 1)
$titleCell.Value = $tableTitle
$titleCell.Font.Name = "Cambria"
$titleCell.Font.Size = 14
$titleCell.Font.Bold = $true
$titleCell.Font.ThemeColor = 5
# give the table a border
$tableRange = $sheet.Range($sheet.Cells($startRow, 1), `
$sheet.Cells($startRow + $numberOfRows - 1, $endColumn))
$tableRange.Borders.Weight = 2
# make the table header bold and blue
$headerRange = $sheet.Range($sheet.Cells($startRow, 1), `
$sheet.Cells($startRow, $endColumn))
$headerRange.Interior.ColorIndex = 37
$headerRange.Font.Bold = $true
}
Main
Example index.html
file
<html>
<head>
<title>FooBar</title>
</head>
<body>
<center>
<p>
<b><font size=6>Bar</font>
</b><br>
<b><font size=4>Foo</font>
</b><br>
<font size=3>FooBar</font>
</p>
<table>
<caption><font size=3><b>Title</b></font></caption>
<thead>
<th align=left valign=bottom><font size=2><u>A</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>B</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>C</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>D</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>E</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>F</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>G</u></font></th><th> </th>
</thead>
<tbody>
<tr>
<td valign=top><font size=2>1</td><td> </td>
<td valign=top><font size=2>2</td><td> </td>
<td valign=top><font size=2>3</td><td> </td>
<td valign=top><font size=2>4</td><td> </td>
<td valign=top><font size=2>5</td><td> </td>
<td valign=top><font size=2>6</td><td> </td>
<td valign=top><font size=2>7</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Do</td><td> </td>
<td valign=top><font size=2>Re</td><td> </td>
<td valign=top><font size=2>Mi</td><td> </td>
<td valign=top><font size=2>Fa</td><td> </td>
<td valign=top><font size=2>So</td><td> </td>
<td valign=top><font size=2>La</td><td> </td>
<td valign=top><font size=2>Ti</td><td> </td>
</tr>
</tbody>
</table>
<br>
<table>
<caption><font size=3><b>Title 2</b></font></caption>
<thead>
<th align=left valign=bottom><font size=2><u>Foo</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>Bar</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>FoobBar</u></font></th><th> </th>
</thead>
<tbody>
<tr>
<td valign=top><font size=2>Fizz</td><td> </td>
<td valign=top><font size=2>Buzz</td><td> </td>
<td valign=top><font size=2>FizzBuzz<br>
<br>
</td><td> </td>
</tr>
<tr>
<td valign=top><font size=2>Foo</td><td> </td>
<td valign=top><font size=2>Bar</td><td> </td>
<td valign=top><font size=2>FooBar<br>
<br>
</td><td> </td>
</tr>
</tbody>
</table>
<br>
<table>
<caption><font size=3><b>Title 3</b></font></caption>
<thead>
<th align=left valign=bottom><font size=2><u>Fizz</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>Buzz</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>FizzBuzz</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>FooBar</u></font></th><th> </th>
</thead>
<tbody>
<tr>
<td valign=top><font size=2>Foo</td><td> </td>
<td valign=top><font size=2>Bar</td><td> </td>
<td valign=top><font size=2>FooBar</td><td> </td>
<td valign=top><font size=2>FizzBuzz</td><td> </td>
</tr>
</tbody>
</table>
<br>
<p>
<b><font size=4>Something</font>
</b><br>
</p>
<table>
<caption><font size=3><b>Title 4</b></font></caption>
<thead>
<th align=left valign=bottom><font size=2><u>Foo</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>Bar</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>Fizz</u></font></th><th> </th>
<th align=left valign=bottom><font size=2><u>Buzz</u></font></th><th> </th>
</thead>
<tbody>
<tr>
<td valign=top><font size=2>Foo</td><td> </td>
<td valign=top><font size=2>Bar</td><td> </td>
<td valign=top><font size=2>Fizz</td><td> </td>
<td valign=top><font size=2>Buzz</td><td> </td>
</tr>
<tr>
<b><td valign=top><font size=2>Foo</td><td> </td>
</b><td valign=top><font size=2>Bar</td><td> </td>
<td valign=top><font size=2>Fizz</td><td> </td>
<td valign=top><font size=2>Buzz</td><td> </td>
</tr>
</tbody>
</table>
<br>
</center>
</body>
</html>
Output
html excel powershell
html excel powershell
edited Aug 29 '16 at 14:32
janos
96.5k12122349
96.5k12122349
asked Aug 26 '16 at 2:23
Charles Clayton
202212
202212
add a comment |
add a comment |
2 Answers
2
active
oldest
votes
up vote
6
down vote
Your code looks fine. Just some notes:
Excel
The main weakness of your design as I see it is its reliance on the Excel application. Since you say robustness is important, Excel would be the main weak point. Microsoft says don't use Office as a server. From my own bitter experience, I think they are right to say that.
It's not so bad in your case because you are doing the same thing over and over, so there should be no surprises, but you never know. Excel can be flakey. It can pop up dialogs that will halt processing. It's especially flakey if you are processing multiple documents simultaneously because Excel was never designed to handle that case.
As an alternative, you could use a library to produce your Excel, such as EPPlus. I've never used that, so I can't endorse it, but it looks good. There may be others out there too. Remember that it's easy to use .Net libraries from PowerShell.
HTML
Your HTML is very nearly XHTML. It looks like just the <br>
tags would need to be fixed. If you have control over the thing that produces the HTML, then you could change it to produce XHTML instead, and then you could consume that with Import-XML
. I'm guessing that's not the case, but I thought it worth mentioning.
Error Handling
You don't have any error handling. If you want your code to be robust you will have to think about what it should do when things go wrong.
Write-Host
Write-Host "Formating Excel file"
If you want to capture these messages in a log, then it would be better to use Write-Output
.
Line Continuation
$tableRange = $sheet.Range($sheet.Cells($startRow, 1), `
$sheet.Cells($startRow + $numberOfRows - 1, $endColumn))
I don't believe you need to use the backtick character there.
-Include
$htmlFiles = Get-ChildItem -path $Directory -Recurse -Include index.html
One oddity of PowerShell is that -Filter
is much faster than -Include
. If the number of folders and files you are searching is small, then it doesn't matter, but if you are searching through lots of them, it can make a huge difference.
(What is the difference in functionality between the two? None, except that -Include
is more general and will work in non-filesystems, but that isn't relevant here.)
There is noImport-Xml
. The only close alternative itImport-CliXml
, and that doesn't work like you'd expect. Instead you have to do something like this:[xml]$XmlDocument = Get-Content -Path C:Cars.xml
after which u can use it like objects. See here.
– Adamar
Aug 31 '16 at 0:07
1
@Adamar, yes, sorry, that's what I meant. XML is a native datatype in PowerShell. Once you cast to an [xml] object, you can pipe that throughSelect-Xml
and use XPath expressions to extract the data you want. My point was simply that working with XML may be more straightforward than working with HTML.
– Dangph
Aug 31 '16 at 12:40
add a comment |
up vote
1
down vote
There is a much simpler approach. Excel has native HTML conversion capabilities that work fine. Using the Microsoft.Office.Interop.Excel in powershell:
function SaveAs-Excel2010{
param([string] $infile, [string] $outfile, [string] $tabname = $null)
$Excel = New-Object -Com Excel.Application
$Excel.DisplayAlerts = $false
$Workbook = $Excel.Workbooks.Open($infile)
if ($tabname) {
$Sheet = $Workbook.Worksheets.Item(1)
$Sheet.Name = $tabname;
}
$Workbook.SaveAs($outfile, $xlFixedFormat)
$Workbook.Close()
$Excel.quit()
}
#sample call
SaveAs-Excel2010 $htmlFile $xlsxFile $myTabName
New contributor
add a comment |
2 Answers
2
active
oldest
votes
2 Answers
2
active
oldest
votes
active
oldest
votes
active
oldest
votes
up vote
6
down vote
Your code looks fine. Just some notes:
Excel
The main weakness of your design as I see it is its reliance on the Excel application. Since you say robustness is important, Excel would be the main weak point. Microsoft says don't use Office as a server. From my own bitter experience, I think they are right to say that.
It's not so bad in your case because you are doing the same thing over and over, so there should be no surprises, but you never know. Excel can be flakey. It can pop up dialogs that will halt processing. It's especially flakey if you are processing multiple documents simultaneously because Excel was never designed to handle that case.
As an alternative, you could use a library to produce your Excel, such as EPPlus. I've never used that, so I can't endorse it, but it looks good. There may be others out there too. Remember that it's easy to use .Net libraries from PowerShell.
HTML
Your HTML is very nearly XHTML. It looks like just the <br>
tags would need to be fixed. If you have control over the thing that produces the HTML, then you could change it to produce XHTML instead, and then you could consume that with Import-XML
. I'm guessing that's not the case, but I thought it worth mentioning.
Error Handling
You don't have any error handling. If you want your code to be robust you will have to think about what it should do when things go wrong.
Write-Host
Write-Host "Formating Excel file"
If you want to capture these messages in a log, then it would be better to use Write-Output
.
Line Continuation
$tableRange = $sheet.Range($sheet.Cells($startRow, 1), `
$sheet.Cells($startRow + $numberOfRows - 1, $endColumn))
I don't believe you need to use the backtick character there.
-Include
$htmlFiles = Get-ChildItem -path $Directory -Recurse -Include index.html
One oddity of PowerShell is that -Filter
is much faster than -Include
. If the number of folders and files you are searching is small, then it doesn't matter, but if you are searching through lots of them, it can make a huge difference.
(What is the difference in functionality between the two? None, except that -Include
is more general and will work in non-filesystems, but that isn't relevant here.)
There is noImport-Xml
. The only close alternative itImport-CliXml
, and that doesn't work like you'd expect. Instead you have to do something like this:[xml]$XmlDocument = Get-Content -Path C:Cars.xml
after which u can use it like objects. See here.
– Adamar
Aug 31 '16 at 0:07
1
@Adamar, yes, sorry, that's what I meant. XML is a native datatype in PowerShell. Once you cast to an [xml] object, you can pipe that throughSelect-Xml
and use XPath expressions to extract the data you want. My point was simply that working with XML may be more straightforward than working with HTML.
– Dangph
Aug 31 '16 at 12:40
add a comment |
up vote
6
down vote
Your code looks fine. Just some notes:
Excel
The main weakness of your design as I see it is its reliance on the Excel application. Since you say robustness is important, Excel would be the main weak point. Microsoft says don't use Office as a server. From my own bitter experience, I think they are right to say that.
It's not so bad in your case because you are doing the same thing over and over, so there should be no surprises, but you never know. Excel can be flakey. It can pop up dialogs that will halt processing. It's especially flakey if you are processing multiple documents simultaneously because Excel was never designed to handle that case.
As an alternative, you could use a library to produce your Excel, such as EPPlus. I've never used that, so I can't endorse it, but it looks good. There may be others out there too. Remember that it's easy to use .Net libraries from PowerShell.
HTML
Your HTML is very nearly XHTML. It looks like just the <br>
tags would need to be fixed. If you have control over the thing that produces the HTML, then you could change it to produce XHTML instead, and then you could consume that with Import-XML
. I'm guessing that's not the case, but I thought it worth mentioning.
Error Handling
You don't have any error handling. If you want your code to be robust you will have to think about what it should do when things go wrong.
Write-Host
Write-Host "Formating Excel file"
If you want to capture these messages in a log, then it would be better to use Write-Output
.
Line Continuation
$tableRange = $sheet.Range($sheet.Cells($startRow, 1), `
$sheet.Cells($startRow + $numberOfRows - 1, $endColumn))
I don't believe you need to use the backtick character there.
-Include
$htmlFiles = Get-ChildItem -path $Directory -Recurse -Include index.html
One oddity of PowerShell is that -Filter
is much faster than -Include
. If the number of folders and files you are searching is small, then it doesn't matter, but if you are searching through lots of them, it can make a huge difference.
(What is the difference in functionality between the two? None, except that -Include
is more general and will work in non-filesystems, but that isn't relevant here.)
There is noImport-Xml
. The only close alternative itImport-CliXml
, and that doesn't work like you'd expect. Instead you have to do something like this:[xml]$XmlDocument = Get-Content -Path C:Cars.xml
after which u can use it like objects. See here.
– Adamar
Aug 31 '16 at 0:07
1
@Adamar, yes, sorry, that's what I meant. XML is a native datatype in PowerShell. Once you cast to an [xml] object, you can pipe that throughSelect-Xml
and use XPath expressions to extract the data you want. My point was simply that working with XML may be more straightforward than working with HTML.
– Dangph
Aug 31 '16 at 12:40
add a comment |
up vote
6
down vote
up vote
6
down vote
Your code looks fine. Just some notes:
Excel
The main weakness of your design as I see it is its reliance on the Excel application. Since you say robustness is important, Excel would be the main weak point. Microsoft says don't use Office as a server. From my own bitter experience, I think they are right to say that.
It's not so bad in your case because you are doing the same thing over and over, so there should be no surprises, but you never know. Excel can be flakey. It can pop up dialogs that will halt processing. It's especially flakey if you are processing multiple documents simultaneously because Excel was never designed to handle that case.
As an alternative, you could use a library to produce your Excel, such as EPPlus. I've never used that, so I can't endorse it, but it looks good. There may be others out there too. Remember that it's easy to use .Net libraries from PowerShell.
HTML
Your HTML is very nearly XHTML. It looks like just the <br>
tags would need to be fixed. If you have control over the thing that produces the HTML, then you could change it to produce XHTML instead, and then you could consume that with Import-XML
. I'm guessing that's not the case, but I thought it worth mentioning.
Error Handling
You don't have any error handling. If you want your code to be robust you will have to think about what it should do when things go wrong.
Write-Host
Write-Host "Formating Excel file"
If you want to capture these messages in a log, then it would be better to use Write-Output
.
Line Continuation
$tableRange = $sheet.Range($sheet.Cells($startRow, 1), `
$sheet.Cells($startRow + $numberOfRows - 1, $endColumn))
I don't believe you need to use the backtick character there.
-Include
$htmlFiles = Get-ChildItem -path $Directory -Recurse -Include index.html
One oddity of PowerShell is that -Filter
is much faster than -Include
. If the number of folders and files you are searching is small, then it doesn't matter, but if you are searching through lots of them, it can make a huge difference.
(What is the difference in functionality between the two? None, except that -Include
is more general and will work in non-filesystems, but that isn't relevant here.)
Your code looks fine. Just some notes:
Excel
The main weakness of your design as I see it is its reliance on the Excel application. Since you say robustness is important, Excel would be the main weak point. Microsoft says don't use Office as a server. From my own bitter experience, I think they are right to say that.
It's not so bad in your case because you are doing the same thing over and over, so there should be no surprises, but you never know. Excel can be flakey. It can pop up dialogs that will halt processing. It's especially flakey if you are processing multiple documents simultaneously because Excel was never designed to handle that case.
As an alternative, you could use a library to produce your Excel, such as EPPlus. I've never used that, so I can't endorse it, but it looks good. There may be others out there too. Remember that it's easy to use .Net libraries from PowerShell.
HTML
Your HTML is very nearly XHTML. It looks like just the <br>
tags would need to be fixed. If you have control over the thing that produces the HTML, then you could change it to produce XHTML instead, and then you could consume that with Import-XML
. I'm guessing that's not the case, but I thought it worth mentioning.
Error Handling
You don't have any error handling. If you want your code to be robust you will have to think about what it should do when things go wrong.
Write-Host
Write-Host "Formating Excel file"
If you want to capture these messages in a log, then it would be better to use Write-Output
.
Line Continuation
$tableRange = $sheet.Range($sheet.Cells($startRow, 1), `
$sheet.Cells($startRow + $numberOfRows - 1, $endColumn))
I don't believe you need to use the backtick character there.
-Include
$htmlFiles = Get-ChildItem -path $Directory -Recurse -Include index.html
One oddity of PowerShell is that -Filter
is much faster than -Include
. If the number of folders and files you are searching is small, then it doesn't matter, but if you are searching through lots of them, it can make a huge difference.
(What is the difference in functionality between the two? None, except that -Include
is more general and will work in non-filesystems, but that isn't relevant here.)
answered Aug 27 '16 at 4:01
Dangph
1,523510
1,523510
There is noImport-Xml
. The only close alternative itImport-CliXml
, and that doesn't work like you'd expect. Instead you have to do something like this:[xml]$XmlDocument = Get-Content -Path C:Cars.xml
after which u can use it like objects. See here.
– Adamar
Aug 31 '16 at 0:07
1
@Adamar, yes, sorry, that's what I meant. XML is a native datatype in PowerShell. Once you cast to an [xml] object, you can pipe that throughSelect-Xml
and use XPath expressions to extract the data you want. My point was simply that working with XML may be more straightforward than working with HTML.
– Dangph
Aug 31 '16 at 12:40
add a comment |
There is noImport-Xml
. The only close alternative itImport-CliXml
, and that doesn't work like you'd expect. Instead you have to do something like this:[xml]$XmlDocument = Get-Content -Path C:Cars.xml
after which u can use it like objects. See here.
– Adamar
Aug 31 '16 at 0:07
1
@Adamar, yes, sorry, that's what I meant. XML is a native datatype in PowerShell. Once you cast to an [xml] object, you can pipe that throughSelect-Xml
and use XPath expressions to extract the data you want. My point was simply that working with XML may be more straightforward than working with HTML.
– Dangph
Aug 31 '16 at 12:40
There is no
Import-Xml
. The only close alternative it Import-CliXml
, and that doesn't work like you'd expect. Instead you have to do something like this: [xml]$XmlDocument = Get-Content -Path C:Cars.xml
after which u can use it like objects. See here.– Adamar
Aug 31 '16 at 0:07
There is no
Import-Xml
. The only close alternative it Import-CliXml
, and that doesn't work like you'd expect. Instead you have to do something like this: [xml]$XmlDocument = Get-Content -Path C:Cars.xml
after which u can use it like objects. See here.– Adamar
Aug 31 '16 at 0:07
1
1
@Adamar, yes, sorry, that's what I meant. XML is a native datatype in PowerShell. Once you cast to an [xml] object, you can pipe that through
Select-Xml
and use XPath expressions to extract the data you want. My point was simply that working with XML may be more straightforward than working with HTML.– Dangph
Aug 31 '16 at 12:40
@Adamar, yes, sorry, that's what I meant. XML is a native datatype in PowerShell. Once you cast to an [xml] object, you can pipe that through
Select-Xml
and use XPath expressions to extract the data you want. My point was simply that working with XML may be more straightforward than working with HTML.– Dangph
Aug 31 '16 at 12:40
add a comment |
up vote
1
down vote
There is a much simpler approach. Excel has native HTML conversion capabilities that work fine. Using the Microsoft.Office.Interop.Excel in powershell:
function SaveAs-Excel2010{
param([string] $infile, [string] $outfile, [string] $tabname = $null)
$Excel = New-Object -Com Excel.Application
$Excel.DisplayAlerts = $false
$Workbook = $Excel.Workbooks.Open($infile)
if ($tabname) {
$Sheet = $Workbook.Worksheets.Item(1)
$Sheet.Name = $tabname;
}
$Workbook.SaveAs($outfile, $xlFixedFormat)
$Workbook.Close()
$Excel.quit()
}
#sample call
SaveAs-Excel2010 $htmlFile $xlsxFile $myTabName
New contributor
add a comment |
up vote
1
down vote
There is a much simpler approach. Excel has native HTML conversion capabilities that work fine. Using the Microsoft.Office.Interop.Excel in powershell:
function SaveAs-Excel2010{
param([string] $infile, [string] $outfile, [string] $tabname = $null)
$Excel = New-Object -Com Excel.Application
$Excel.DisplayAlerts = $false
$Workbook = $Excel.Workbooks.Open($infile)
if ($tabname) {
$Sheet = $Workbook.Worksheets.Item(1)
$Sheet.Name = $tabname;
}
$Workbook.SaveAs($outfile, $xlFixedFormat)
$Workbook.Close()
$Excel.quit()
}
#sample call
SaveAs-Excel2010 $htmlFile $xlsxFile $myTabName
New contributor
add a comment |
up vote
1
down vote
up vote
1
down vote
There is a much simpler approach. Excel has native HTML conversion capabilities that work fine. Using the Microsoft.Office.Interop.Excel in powershell:
function SaveAs-Excel2010{
param([string] $infile, [string] $outfile, [string] $tabname = $null)
$Excel = New-Object -Com Excel.Application
$Excel.DisplayAlerts = $false
$Workbook = $Excel.Workbooks.Open($infile)
if ($tabname) {
$Sheet = $Workbook.Worksheets.Item(1)
$Sheet.Name = $tabname;
}
$Workbook.SaveAs($outfile, $xlFixedFormat)
$Workbook.Close()
$Excel.quit()
}
#sample call
SaveAs-Excel2010 $htmlFile $xlsxFile $myTabName
New contributor
There is a much simpler approach. Excel has native HTML conversion capabilities that work fine. Using the Microsoft.Office.Interop.Excel in powershell:
function SaveAs-Excel2010{
param([string] $infile, [string] $outfile, [string] $tabname = $null)
$Excel = New-Object -Com Excel.Application
$Excel.DisplayAlerts = $false
$Workbook = $Excel.Workbooks.Open($infile)
if ($tabname) {
$Sheet = $Workbook.Worksheets.Item(1)
$Sheet.Name = $tabname;
}
$Workbook.SaveAs($outfile, $xlFixedFormat)
$Workbook.Close()
$Excel.quit()
}
#sample call
SaveAs-Excel2010 $htmlFile $xlsxFile $myTabName
New contributor
edited Nov 16 at 17:49
Sᴀᴍ Onᴇᴌᴀ
7,71061748
7,71061748
New contributor
answered Nov 16 at 17:20
Chris B
111
111
New contributor
New contributor
add a comment |
add a comment |
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f139670%2fpowershell-script-to-create-excel-spreadsheet-from-html-file%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown