95

I have a Chrome extension. I need to analyse the HTML source of the current page. I found all kinds of solutions here involving background pages and content scripts, but none of them helped me. Here is what I have so far:

manifest.json

{
  "name": "Extension",
  "version": "1.0",
  "description": "Extension",
  "browser_action": {
    "default_icon": "bmarkred.ico",
    "popup": "Test.html"
  },
  "content_scripts": [
    {
      "matches": ["http://*/*"],
      "js": ["content.js"]
    }
  ],
  "background": {
    "page": "backgroundPage.html"
  },
  "permissions": [
    "cookies",
    "tabs",
    "http://*/*", 
    "https://*/*"
  ]
}

background.html

<html>
<head>
<script type="text/javascript">
    // This code sits at the top level of the script, so it runs once, as soon
    // as this page is loaded; nothing here waits for a user action.
    try {
        // Look up the selected tab (window id passed as null) ...
        chrome.tabs.getSelected(null, function (tab) {
            // ... and send it a {action: "getSource"} request. The last argument
            // is the response callback; whatever it receives is shown in an alert.
            chrome.tabs.sendRequest(tab.id, {action: "getSource"}, function(source) {
                alert(source);
            });
        });
    }
    // Only exceptions thrown synchronously by the getSelected call above reach
    // this handler; errors raised later inside the callbacks are not caught here.
    catch (ex) {
        alert(ex);
    }
</script>
</head>
</html>

content.js

// Content script: replies to "getSource" requests with the inner markup of
// the page's <html> element.
chrome.extension.onRequest.addListener(function (message, origin, respond) {
    // Requests with any other action are ignored and get no reply.
    if (message.action != "getSource") {
        return;
    }
    var rootElement = document.getElementsByTagName('html')[0];
    respond(rootElement.innerHTML);
});

The alert always shows undefined, even if I change the callback call in content.js to:

callback('hello'); 

I still get the same result. What am I doing wrong? Maybe I'm going about this the wrong way. What I really need is this: when the user opens the extension popup (and only then), I need the HTML of the current page so I can analyse it.

Ahmed Ashour
  • 4,462
  • 10
  • 33
  • 49
Mr T.
  • 4,000
  • 8
  • 39
  • 58
  • A problem is that the code in your background page is run immediately (before the content scripts are injected). A very similar/duplicate question has been asked before; Have a look at the answer at [Open a new Google Chrome tab and get the source](http://stackoverflow.com/a/10162291/938089?open-a-new-google-chrome-tab-and-get-the-source). – Rob W Jul 27 '12 at 10:06
  • Thanks for your reply rob. i copied the code segments in your attached link but it still doesn't work. The problem is that my extension is a popup and i need to get the HTML only when the user opens my extension. for example, if the current tab is facebook.com then only when i open my extension, i will retrieve the html source to my js file (not the content script or the background page). – Mr T. Jul 27 '12 at 17:50
  • Update your question with your current code. The code has to contain comments which highlight the problem. – Rob W Jul 27 '12 at 19:46

2 Answers

165

Inject a script into the page you want to get the source from and message it back to the popup....

manifest.json

{
  "name": "Get pages source",
  "version": "1.0",
  "manifest_version": 2,
  "description": "Get pages source from a popup",
  "browser_action": {
    "default_icon": "icon.png",
    "default_popup": "popup.html"
  },
  "permissions": ["tabs", "<all_urls>"]
}

popup.html

<!DOCTYPE html>
<html style=''>
<head>
<!-- The popup's logic lives in the external file popup.js rather than in an inline script. -->
<script src='popup.js'></script>
</head>
<body style="width:400px;">
<!-- Status/output area: starts with this placeholder text; popup.js writes into the element with id 'message'. -->
<div id='message'>Injecting Script....</div>
</body>
</html>

popup.js

/**
 * Receives the message posted by the injected script and shows the page
 * source it carries inside the popup.
 *
 * @param {{action: string, source: string}} request - Message object; only
 *     messages whose action is "getSource" are handled.
 * @param {object} sender - Sender details supplied by Chrome (unused).
 */
chrome.runtime.onMessage.addListener(function(request, sender) {
  if (request.action === "getSource") {
    // Look the output element up explicitly. The `message` variable declared
    // in onWindowLoad is local to that function, so a bare `message` here
    // would only resolve through the browser's implicit id-to-global mapping,
    // which is easy to break (e.g. by renaming the id or shadowing the name).
    var messageElement = document.querySelector('#message');
    messageElement.innerText = request.source;
  }
});

/**
 * Injects getPagesSource.js into a tab (the tab id is passed as null) and,
 * when the injection fails, writes the error text into the popup's
 * #message element.
 */
function onWindowLoad() {
  var statusElement = document.querySelector('#message');
  var injectionDetails = { file: "getPagesSource.js" };

  // Runs after the injection attempt. Injecting into an extension page or
  // the webstore/NTP produces an error, which is surfaced to the user here.
  function reportInjectionResult() {
    var failure = chrome.runtime.lastError;
    if (!failure) {
      return;
    }
    statusElement.innerText = 'There was an error injecting script : \n' + failure.message;
  }

  chrome.tabs.executeScript(null, injectionDetails, reportInjectionResult);
}

// Kick off the injection once the popup document has finished loading.
window.onload = onWindowLoad;

getPagesSource.js

// @author Rob W <http://stackoverflow.com/users/938089/rob-w>
// Demo: var serialized_html = DOMtoString(document);

/**
 * Serializes the direct children of a DOM node into one markup string.
 *
 * Element children contribute their outerHTML, text children their raw
 * nodeValue, CDATA sections and comments are re-wrapped in their delimiters,
 * and a doctype child is rebuilt as a <!DOCTYPE ...> declaration followed by
 * a newline. Children of any other node type are skipped.
 *
 * @param {Node} document_root - Node whose children are serialized
 *     (e.g. `document`).
 * @returns {string} The concatenated markup, or '' when there are no children.
 */
function DOMtoString(document_root) {
    // Rebuilds a doctype declaration. (X)HTML documents are identified by
    // public identifiers; SYSTEM is only emitted when there is a system
    // identifier but no public one.
    function doctypeToString(doctype) {
        var declaration = "<!DOCTYPE " + doctype.name;
        if (doctype.publicId) {
            declaration += ' PUBLIC "' + doctype.publicId + '"';
        } else if (doctype.systemId) {
            declaration += ' SYSTEM';
        }
        if (doctype.systemId) {
            declaration += ' "' + doctype.systemId + '"';
        }
        return declaration + '>\n';
    }

    var serialized = '';
    for (var child = document_root.firstChild; child; child = child.nextSibling) {
        var kind = child.nodeType;
        if (kind === Node.ELEMENT_NODE) {
            serialized += child.outerHTML;
        } else if (kind === Node.TEXT_NODE) {
            serialized += child.nodeValue;
        } else if (kind === Node.CDATA_SECTION_NODE) {
            serialized += '<![CDATA[' + child.nodeValue + ']]>';
        } else if (kind === Node.COMMENT_NODE) {
            serialized += '<!--' + child.nodeValue + '-->';
        } else if (kind === Node.DOCUMENT_TYPE_NODE) {
            serialized += doctypeToString(child);
        }
    }
    return serialized;
}

// Serialize the current document and post it to the extension as a
// "getSource" message carrying the markup in its `source` field.
chrome.runtime.sendMessage({ action: "getSource", source: DOMtoString(document) });
Xan
  • 71,217
  • 14
  • 165
  • 189
PAEz
  • 8,166
  • 2
  • 33
  • 26
  • @Gil Tankus So sorry for my first post, didn't pay enough attention to the comments (again) and ended up just regurgitating what Rob W said. The new post should have what you wanted. – PAEz Jul 28 '12 at 12:38
  • Thanks, your answer was really helpful, my problem is that the on onMessage happens asynchronous. in my popup, i have all sorts of other stuff that relay on the source HTML. how can i save the source in a global var and only then continue with the page onload function? – Mr T. Jul 28 '12 at 18:47
  • I don't think you can. You're either going to have to put it in the callback's code or in a function and call that in the callback... if only JS had a `goto` command, aye? ;P – PAEz Jul 28 '12 at 18:57
  • 25
    Why not just something like document.documentElement.outerHTML instead of the DOMtoString function? – djfm Jan 03 '15 at 11:24
  • @djfm That would be fine pretty much all of the time. Its just that from what I could tell Rob W's function is more complete...returns the doctype for instance that your solution doesn't, yours is only getting the html part. – PAEz Jan 03 '15 at 18:54
  • I want to use document.getElementByTagName in popup.js where document is document object of the current tab. I couldn't do it. I changed `DOMtoString(document)` to `document` and tried to use it as `request.source.getElementByTagName` but things didn't work that well. Any comment? – Emre Aydin Sep 08 '15 at 02:17
  • This answer works by *re-generating* the page source from the DOM (which could be altered by other scripts at runtime), it doesn't return the raw HTML response from the server. – Dai Jun 18 '16 at 01:05
  • The most bizarre thing is that I wanted to use an ID other than message, like #getsource, but document.querySelector('#gethtml') does nothing. I just used gethtml.innerText without document.querySelector(). Can you explain that? – Gino Nov 23 '16 at 00:31
  • So was it meant to be getsource or gethtml?...maybe that was it. Theres a thing where you can access an element by just using its id in js....Ive never seen anyone use it. I wouldnt, could see it being a big source of confusion, especially as its not known by everyone. Heres a simple example.... http://jsbin.com/jedotoxuqa/edit?html,css,js,output – PAEz Dec 04 '16 at 04:35
  • Can someone please tell me how to parse that message.innerText so that I can do some calculations with the table values returned from a custom application page? – Share_Improve Dec 12 '16 at 09:30
  • @Share_Improve [DOM Parser](https://developer.mozilla.org/en/docs/Web/API/DOMParser) is good for that. – PAEz Dec 12 '16 at 13:43
  • how can I `console.log` the `message.innerText` when updated, I'm familiar with `"background": { "scripts": ["background.js"], "persistent": false },`.. When I don't declare any background in *manifest* the *generated background page* doesn't appears, so I can't see the `console` or maybe it doesn't log. I tried editing `popup.js` the line after `message.innerText = request.source;`. **How can I solve this?** – Maifee Ul Asad Oct 05 '19 at 07:56
  • Thank you very much! This saved my day! – Sudharsan Ravikumar Apr 15 '20 at 13:38
  • Thanks @PAEz, that saved my day too. – Siva Sankar Nov 17 '21 at 18:38
  • Could someone update this to work with manifest v3 please? I have tried changing chrome.tabs.executeScript to chrome.scripting.executeScript and adding scripting to permissions, but there must be more. Thank you. – Døner M. Apr 06 '22 at 13:54
  • chrome.runtime.sendMessage needs an extension id as the first parameter – Dr. Freddy Dimethyltryptamine May 21 '22 at 10:46
1

Here is my solution:

/**
 * Handles the "getSource" message posted by the injected code: stores the
 * page source and alerts the page title extracted from it.
 *
 * @param {{action: string, source: string}} request - Message object; only
 *     messages whose action is "getSource" and whose source is a string are
 *     handled.
 * @param {object} sender - Sender details supplied by Chrome (unused).
 */
chrome.runtime.onMessage.addListener(function(request, sender) {
    if (request.action !== "getSource" || typeof request.source !== "string") {
        return;
    }

    // Store the source on the window explicitly instead of through `this`,
    // whose value inside this callback depends on how the listener is invoked.
    // NOTE(review): assumes the original intent was a global `pageSource` - confirm.
    window.pageSource = request.source;

    // match() returns null when the page has no non-empty <title>; indexing
    // the result unconditionally would throw a TypeError in that case.
    const titleMatch = window.pageSource.match(/<title[^>]*>([^<]+)<\/title>/);
    if (titleMatch) {
        alert(titleMatch[1]);
    }
});

    // Find the active tab of the current window and inject a one-line script
    // that posts the page's outerHTML back as a "getSource" message.
    chrome.tabs.query({ active: true, currentWindow: true }, tabs => {
        // Guard against an empty result so that tabs[0].id cannot throw.
        if (!tabs || tabs.length === 0) {
            return;
        }
        chrome.tabs.executeScript(
            tabs[0].id,
            { code: 'var s = document.documentElement.outerHTML; chrome.runtime.sendMessage({action: "getSource", source: s});' },
            () => {
                // Injection failure is reported through chrome.runtime.lastError
                // inside this callback; log it rather than letting it pass
                // unnoticed.
                if (chrome.runtime.lastError) {
                    console.error('Could not inject script: ' + chrome.runtime.lastError.message);
                }
            }
        );
    });
Dejan S
  • 725
  • 5
  • 12
  • Have you tried this method (and submit the app to Webstore) before? It's almost like a hack of using content_scripts without using it. – Hieu Nguyen Jun 13 '21 at 01:28