Click here to Skip to main content
16,016,643 members
Please Sign up or sign in to vote.
1.00/5 (1 vote)
See more:
I am currently using Node.js and multiple modules such as fetch, HTMLparser, and bodyParser to scrape the HTML of a webpage entered into my app. It works for most of the HTML, but for some reason it seems to be missing certain nested HTML elements, whereas other nested HTML elements are displayed. I was hoping someone could help me solve this issue, as I can't seem to find a solution.




JavaScript
const express = require("express");
const fetch = require("isomorphic-fetch");
const jsdom = require("jsdom");
const { JSDOM } = jsdom;
const HTMLparser = require("node-html-parser");
const bodyParser = require("body-parser");

const test = require("./htmlFormat.js");
const htmlFormat = require("./htmlFormat.js");
/* Functions */

/**
 * Makes sure a user-supplied web address carries an HTTP(S) scheme.
 *
 * Addresses that already start with `http://` or `https://` (in any letter
 * case) are returned unchanged apart from trimming surrounding whitespace;
 * anything else gets `https://` prepended. The result is also logged.
 *
 * @param {string} url - Address as typed by the user, e.g. `example.com`.
 * @returns {string} The address with a scheme in front.
 * @throws {TypeError} If `url` is not a string.
 */
function authenticateUrl(url) {
  if (typeof url !== "string") {
    throw new TypeError("authenticateUrl expects the URL as a string");
  }

  // Whitespace around a pasted address would otherwise end up inside the URL.
  const trimmed = url.trim();

  // Accept both http and https so "http://host" is not turned into
  // "https://http://host"; schemes are case-insensitive.
  const hasHttpScheme = /^https?:\/\//i.test(trimmed);
  const normalized = hasHttpScheme ? trimmed : `https://${trimmed}`;

  console.log(normalized);
  return normalized;
}
//////////////////////////////////////////////
// Startup diagnostics: print a marker, then call testPrint() on the
// ./htmlFormat.js module that was required above as `test`.
console.log("test");
test.testPrint();

const app = express();

// Parse URL-encoded request bodies; `extended: true` is passed through to
// body-parser unchanged.
const urlencodedOptions = { extended: true };
app.use(bodyParser.urlencoded(urlencodedOptions));

// GET / answers with the index page stored under ./public.
app.get("/", (req, res) => {
  const indexPage = `${__dirname}/public/index.html`;
  res.sendFile(indexPage);
});
/**
 * POST / — scrapes the page named in the `websiteOption` form field.
 *
 * The client is answered immediately ("Thanks for posting that"); the page is
 * then fetched, parsed with jsdom, re-serialised through node-html-parser and
 * handed to `htmlFormat.format`. Failures during scraping are logged only,
 * because the response has already been sent by then.
 *
 * Only the markup returned by the server is seen here: elements that the
 * remote page creates with client-side JavaScript are absent, since jsdom
 * does not execute page scripts unless `runScripts` is enabled.
 *
 * NOTE(review): this fetches an arbitrary client-supplied URL from the
 * server; consider restricting the allowed hosts before exposing it publicly.
 */
app.post("/", async function (req, res) {
  console.log(req.body);
  const website = req.body ? req.body.websiteOption : undefined;
  console.log(website);

  // Reject missing or non-text input up front instead of letting it throw
  // inside an async handler, where it would become an unhandled rejection.
  if (typeof website !== "string" || website.trim() === "") {
    res.status(400).send("Please provide a website to scrape");
    return;
  }

  res.send("Thanks for posting that");

  try {
    const authenticatedWebsite = authenticateUrl(website);
    console.log(authenticatedWebsite);

    const response = await fetch(authenticatedWebsite);
    // fetch() resolves for 4xx/5xx too; do not treat an error page as content.
    if (!response.ok) {
      throw new Error(
        `Request to ${authenticatedWebsite} failed with status ${response.status}`
      );
    }
    const text = await response.text();

    // The JSDOM constructor is synchronous, so there is nothing to await.
    const dom = new JSDOM(text);

    // Inner markup of the root <html> element.
    const html = dom.window.document.documentElement.innerHTML;
    console.log(html);

    const root = HTMLparser.parse(html).toString();
    htmlFormat.format(root);
  } catch (err) {
    console.error(err);
  }
});
// Serve the contents of ./public as static files. This is registered after
// the "/" routes above.
const publicDir = `${__dirname}/public`;
app.use(express.static(publicDir));

// Start accepting connections on port 3000 and announce it once listening.
app.listen(3000, () => {
  console.log("Server started on port 3000");
});


What I have tried:

I have tried different parser modules but I get the same issue as before.
Posted
Comments
Richard MacCutchan 10-Oct-22 4:31am    
You have not specified what the problem is and where it occurs. The statement, "it seems to be missing certain nested html elements where as other nested html elements are displayed." does not give us any clues. Please use the Improve question link above, and add complete details of what is not working.
Richard Deeming 11-Oct-22 6:45am    
Don't forget that you're just fetching the raw HTML of the remote site. If it uses Javascript to create elements dynamically, those won't be available in your script.

This content, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)



CodeProject, 20 Bay Street, 11th Floor Toronto, Ontario, Canada M5J 2N8 +1 (416) 849-8900