I have a word file saved as an XML file and I wanted to extract all the information between <w:t> tags. This is the piece of code I am using.
//var parser = new DOMParser();
const fs = require('fs');
const { connect } = require('http2');
var format = require('xml-formatter');
try {
// read contents of the file
const data = fs.readFileSync('untitled.xml', 'UTF-8');
//var d = format(data);
// split the contents by new line
const lines = data.split(/\r?\n/);
// print all lines
lines.forEach((line) => {
if(/<w:t>.*:.*<\/w:t>/.test(line)){
console.log(line);
// var match = line.match(/<w:t>Comment:.*<\/w:t>/g);
// console.log(match);
// console.log('ASDRWSGQERGEAHGERAGERgfew');
}
});
} catch (err) {
console.error(err);
}
The problem is that as the tree structure of the XML is not preserved while I'm reading it the output tends to have a lot of unnecessary lines in it. When I copy-paste the same file from a browser and save it as an XML, the output is clean. Is there another way to read the XML to get a clean output? current garbage output