The following PHP script quickly finds all the links on a web page, removes any duplicates, and reports how many times each duplicated link appears.
/*
 * Fetch a page and tally every anchor that carries an href attribute.
 * Afterwards $urls is a map of href => number of times it appeared.
 */
$html = file_get_contents('https://www.example.com/');
if ($html === false) {
    die('Failed to fetch the page.' . PHP_EOL);
}

$dom = new DOMDocument();

/* Collect libxml parse errors internally instead of @-suppressing them;
 * real-world HTML is rarely well-formed and would otherwise spew warnings. */
libxml_use_internal_errors(true);
$dom->loadHTML($html);
libxml_clear_errors();

/* Grab only the anchors that actually have an href, so anchors without
 * one are not tallied under an empty-string key. */
$xpath = new DOMXPath($dom);
$hrefs = $xpath->query('/html/body//a[@href]');

$urls = [];
foreach ($hrefs as $href) {
    $url = $href->getAttribute('href');
    /* Do not keep duplicates; instead count how often each URL occurs. */
    $urls[$url] = ($urls[$url] ?? 0) + 1;
}
/* Output each unique URL on its own line. */
foreach (array_keys($urls) as $link) {
    echo $link . PHP_EOL;
}
If needed, you can also print the occurrence count for each duplicated URL.
/* Output each URL together with how many times it appeared on the page. */
foreach ($urls as $link => $hits) {
    echo $link . " - " . $hits . PHP_EOL;
}