0

I am using cheerio to parse HTML code into separate nodes. I can easily do $("*"), but this only gets me the regular HTML element nodes and not the separate text nodes. Let's consider 3 user inputs:

One:

text only

I need: single text node.

Two:

<div>
  text 1
  <div>
    inner text
  </div>
  text 2
</div>

I need: text node + div node + text node in same sequence.

Three:

<div>
  <div>
    inner text 1
    <div>
      inner text 2
    </div>
  </div>
  <div>
    inner text 3
  </div>
</div>

I need: 2 div nodes

Possible?

Rehmat
  • 1,807
  • 2
  • 20
  • 25

2 Answers

1

In the hope that this helps someone: the filter function seems to return text nodes as well. I got help from this answer: https://stackoverflow.com/a/6520267/3800042

// Load the markup held in `tree` into cheerio; `$` is the resulting query function.
// NOTE(review): `tree` is not defined in this snippet — presumably the HTML string to inspect; confirm.
var $ = cheerio.load(tree);
/**
 * Logs every child node of `node` in document order, then recurses into each
 * child so the whole subtree is printed depth-first.
 *
 * `.contents()` already yields text (and comment) nodes alongside element
 * nodes, so its result is walked directly — no filtering step is required to
 * see the text nodes.
 *
 * Relies on a cheerio query function `$` being in scope.
 *
 * @param {*} node - Anything `$()` accepts: a cheerio selection or a single node.
 * @param {string} [level="--"] - Prefix printed before each entry; "--" is
 *   appended for every level of nesting.
 */
function iterate(node, level = "--") {
  $(node).contents().each((i, item) => {
    // One line per child: depth prefix, sibling index, node type, text content.
    console.log(level, `(${i})`, item.type, $(item).text());
    iterate(item, `${level}--`);
  });
}
// Start the walk at the document root; `level` is omitted so the default "--" prefix is used.
iterate($.root());

HTML input

<div>
  text 1
  <div>
    inner text
  </div>
  text 2
</div>

Result

-- (0) tag 

  text 1



    inner text



  text 2


---- (0) text 

  text 1



---- (1) tag 

    inner text



------ (0) text 

    inner text



---- (2) text 

  text 2
Rehmat
  • 1,807
  • 2
  • 20
  • 25
  • "`filter` function seems to return text nodes also." - This is NOT true. The filter line in your code does nothing: `.filter(function() { return true; });`. You need to filter out non-text types: `.filter(function() { return this.nodeType == Node.TEXT_NODE; });` – toinetoine Jan 08 '21 at 21:49
0

I hope the following code helps.

const cheerio = require("cheerio");

// Sample markup: a list whose children mix a comment, <li> elements and a bare text node ("Peach").
const htmlText = `<ul id="fruits">
  <!--This is a comment.-->
  <li class="apple">Apple</li>
  Peach
  <li class="orange">Orange</li>
  <li class="pear">Pear</li>
</ul>`;

const $ = cheerio.load(htmlText);
const fruitList = $('ul#fruits');
// contents() returns every child node of the list, not just the elements.
const contents = fruitList.contents();
console.log(contents.length);// 9, since nodes like '\n' are included
// Quick check that the whitespace-only pattern matches a newline followed by a space.
const whitespaceOnly = new RegExp('^\\s*$');
console.log(whitespaceOnly.test('\n '));
/**
 * Tells whether `node` is a text node made up solely of whitespace
 * (the empty string counts as whitespace-only).
 *
 * @param {{type: string, data?: string}} node - Node to inspect; only `type` and `data` are read.
 * @returns {boolean} `true` only for a `'text'` node whose `data` matches `^\s*$`.
 */
function isWhitespaceTextNode(node){
    const isTextNode = node.type === 'text';
    return isTextNode && /^\s*$/.test(node.data);
}
// This is cheerio's own filter, not Array.prototype.filter: its callback
// receives (index, element), so the node is taken from the second argument.
const nonWhitespaceTextContents = contents.filter(
    (_, node) => !isWhitespaceTextNode(node)
);
console.log(nonWhitespaceTextContents.length);// 5, since nodes like '\n ' are excluded
nonWhitespaceTextContents.each((_, node) => {
    console.log(node);
});
// Nodes printed, in order:
//[comment node]
//[li node] apple
//[text node] peach
//[li node] orange
//[li node] pear
TTY112358
  • 114
  • 6