1

I am reading several medium sized excel files, approximately 50 columns x 500 rows. The problem is that some formatting is dragged until column XFD, that is column number 16384=2^16/4 if my math is correct. With OleDb this does not cause any problems as the following query let me select only a subset of the total spreadsheet without huge performance loss caused by the remaining columns

SELECT * FROM [SheetNameA1:BC500]

This takes around 3 seconds. The problem with OleDb is that it requires windows and a file at a disk, both of these causes some trouble with our cloud infrastructure and we would like to use OpenXml instead. OpenXml can be used with DOM-approach or SAX-approach. The first is a show stopper as a call to Worksheet getter at WorksheetPart loads the whole xml with all columns taking around 10 seconds.

Using the SAX approach to navigate the XML gives me the 5 methods for navigating a OpenXmlReader: LoadCurrentElement, Read, ReadFirstChild, ReadNextSibling and Skip. Using these I can:

  • use Read until I am hitting the Row elements
  • use ReadFirstChild to hit first Cell element and ReadNextSibling to read remaining and load them using LoadCurrentElement until column BC
  • use ReadNextSibling until the whole Row is read (ignoring content, i.e. no call to LoadCurrentElement)

The performance loss is in the last step. How can I make the reader jump to the next row without looping through all the cells.

I think the key might be to use Skip to loop over all children. The problem is that I need to be at Row-element to skip all Cell elements and there is no way to "rewind".

Here is an example I made to illustrate the problem. The excel file is simply marked with x in the range A1:XFD500. And here are the messures from while-time and load-time:

enter image description here

using DocumentFormat.OpenXml;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Spreadsheet;
using System.Diagnostics;
using System.Text.RegularExpressions;

using (var file = File.Open("testfile.xlsx", FileMode.Open, FileAccess.Read))
{
    var doc = SpreadsheetDocument.Open(file, false);

    var workbookPart = doc.WorkbookPart;
    var sheet = doc
        .WorkbookPart
        .Workbook
        .Descendants<Sheet>()
        .First(s => s.Name == "sheetName");

    var worksheetPart = (WorksheetPart)doc.WorkbookPart.GetPartById(sheet.Id);

    OpenXmlReader reader = OpenXmlReader.Create(worksheetPart);
    Dictionary<int, string> sharedStringCache = new Dictionary<int, string>();
    var rows = new List<List<object>>();

    int i = 0;
    foreach (var el in workbookPart.SharedStringTablePart.SharedStringTable.ChildElements)
    {
        sharedStringCache.Add(i++, el.InnerText);
    }

    TimeSpan whileTime = TimeSpan.Zero;
    TimeSpan loadTime = TimeSpan.Zero;
    var stopwatch1 = new Stopwatch();
    var stopwatch2 = new Stopwatch();

    int lastColumnWithData = 50;
    while (reader.Read())
    {
        if (reader.ElementType == typeof(Row))
        {
            reader.ReadFirstChild();
            List<object> cells = new List<object>();
            do
            {
                if (reader.ElementType == typeof(Cell))
                {
                    stopwatch2.Restart();
                    Cell c = (Cell)reader.LoadCurrentElement();
                    loadTime += stopwatch2.Elapsed;

                    var columnLetters = Regex.Replace(c.CellReference, @"[\d]", string.Empty).ToUpper();
                    var columnIndex = NumberFromExcelColumn(columnLetters);
                    var rowIndex = int.Parse(Regex.Replace(c.CellReference, @"[^\d]", string.Empty).ToUpper());

                    if (columnIndex > lastColumnWithData)
                    {
                        stopwatch1.Restart();
                        while (reader.ReadNextSibling()) {}
                        whileTime += stopwatch1.Elapsed;
                        break;
                    }

                    object value;

                    switch (c.DataType?.Value)
                    {
                        case CellValues.Boolean:
                            value = bool.Parse(c.CellValue.InnerText);
                            break;
                        case CellValues.Date:
                            value = DateTime.Parse(c.CellValue.InnerText);
                            break;
                        case CellValues.Number:
                            value = double.Parse(c.CellValue.InnerText);
                            break;
                        case CellValues.InlineString:
                        case CellValues.String:
                            value = c.CellValue.InnerText;
                            break;
                        case CellValues.SharedString:
                            value = sharedStringCache[int.Parse(c.CellValue.InnerText)];
                            break;
                        default:
                            value = c.CellValue.InnerText;
                            break;
                    }

                    if (value != null)
                        cells.Add(value);
                }

            } while (reader.ReadNextSibling());

            if (cells.Any())
                rows.Add(cells);
        }
    }
}

static int NumberFromExcelColumn(string column)
{
    int retVal = 0;
    string col = column.ToUpper();
    for (int iChar = col.Length - 1; iChar >= 0; iChar--)
    {
        char colPiece = col[iChar];
        int colNum = colPiece - 64;
        retVal = retVal + colNum * (int)Math.Pow(26, col.Length - (iChar + 1));
    }
    return retVal;
}

made using examples from:

How to open a huge excel file efficiently

Fastest function to generate Excel column letters in C#

toftis
  • 1,051
  • 9
  • 26

0 Answers0