This tool walks through a website's sitemap and returns all unique email addresses it finds on any listed page. For the tool to work, it needs to be given a direct link to the sitemap, e.g. https://www.example.com/sitemap.xml. It also loops through any nested sitemaps. It will find some false positives, and it can take a long time to run on large sites. To use it, download the script, place it in a folder, and run it from that folder. When it runs, it creates two files, email-addresses.csv and discovered-pages.txt, in the current working directory (the script's own folder when it is run from there).
function ProcessSitemap($sitemapUrl, $emailHash = $null) {
    <#
    .SYNOPSIS
    Crawls a sitemap, including nested sitemaps, and collects the unique
    email addresses found on the pages it lists.

    .DESCRIPTION
    Downloads the sitemap at $sitemapUrl and reads every <loc> element in the
    sitemaps.org 0.9 namespace. Entries ending in ".xml" are treated as nested
    sitemaps and processed recursively; every other entry is downloaded and
    its content scanned with an email regex. Each successfully fetched page is
    appended to discovered-pages.txt. When the outermost call finishes, the
    collected addresses are appended to email-addresses.csv. Both file names
    are relative, so they resolve against the current location.

    .PARAMETER sitemapUrl
    URL of the sitemap to process.

    .PARAMETER emailHash
    Accumulator shared between recursive calls so that addresses stay unique
    across nested sitemaps. Callers normally omit it; when omitted, this call
    is the outermost one and is responsible for writing the CSV.
    #>

    # The TLD class is [A-Za-z]; the previous [A-Z|a-z] also accepted a literal '|'.
    $regexPattern = "\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
    $logFile = "discovered-pages.txt"
    $emailFile = "email-addresses.csv"

    # Only the outermost call creates the accumulator and exports it, so an
    # address seen in several nested sitemaps is written to the CSV once.
    $isTopLevel = $null -eq $emailHash
    if ($isTopLevel) {
        $emailHash = @{}
    }

    $pageUrls = @()
    try {
        $response = Invoke-WebRequest $sitemapUrl -UseBasicParsing -ErrorAction Stop
        $sitemap = [xml]$response.Content
        $nsMgr = New-Object System.Xml.XmlNamespaceManager -ArgumentList $sitemap.NameTable
        $nsMgr.AddNamespace("ns", "http://www.sitemaps.org/schemas/sitemap/0.9")
        # InnerText covers both plain-text and CDATA-wrapped <loc> values; Trim()
        # removes the whitespace/newlines that pretty-printed sitemaps put
        # around the URL. Empty entries are dropped.
        $pageUrls = @(
            $sitemap.SelectNodes("//ns:loc", $nsMgr) |
                ForEach-Object { $_.InnerText.Trim() } |
                Where-Object { $_ }
        )
    }
    catch {
        # An unreachable or malformed sitemap must not abort the whole crawl.
        Write-Warning "Could not load sitemap ${sitemapUrl}: $($_.Exception.Message)"
    }

    Write-Output "Found $($pageUrls.Count) URLs in the sitemap."

    foreach ($pageUrl in $pageUrls) {
        if ($pageUrl.EndsWith(".xml")) {
            if ([Uri]::IsWellFormedUriString($pageUrl, [UriKind]::Absolute)) {
                Write-Output "Processing absolute nested sitemap $pageUrl..."
                ProcessSitemap $pageUrl $emailHash
            }
            else {
                # Resolve a relative <loc> against the sitemap that contained it.
                $nestedSitemapUrl = [System.Uri]::new([System.Uri]::new($sitemapUrl), $pageUrl).AbsoluteUri
                Write-Output "Processing relative nested sitemap $nestedSitemapUrl..."
                ProcessSitemap $nestedSitemapUrl $emailHash
            }
        }
        else {
            Write-Output "Processing page $pageUrl..."
            try {
                $pageResponse = Invoke-WebRequest $pageUrl -UseBasicParsing -ErrorAction Stop
            }
            catch {
                # Skip pages that fail to download instead of re-scanning the
                # previous page's stale response and logging this one as found.
                Write-Warning "Skipping page ${pageUrl}: $($_.Exception.Message)"
                continue
            }

            foreach ($emailMatch in [regex]::Matches($pageResponse.Content, $regexPattern)) {
                $email = $emailMatch.Value.Trim()
                if (-not $emailHash.ContainsKey($email)) {
                    Write-Output $email
                    $emailHash.Add($email, $true)
                }
            }
            Add-Content -Path $logFile -Value $pageUrl
        }
    }

    if ($isTopLevel) {
        # Export-Csv rejects a null pipeline input, so skip it when nothing was found.
        if ($emailHash.Count -gt 0) {
            $emailHash.Keys |
                Select-Object @{Name='Email';Expression={$_}} |
                Export-Csv $emailFile -NoTypeInformation -Append
        }
        Write-Output "All unique email addresses found have been exported to $emailFile."
    }
}
# Script entry point: ask the user for the root sitemap URL and crawl it.
$rootSitemap = Read-Host -Prompt "Enter the sitemap URL to scrape"
ProcessSitemap -sitemapUrl $rootSitemap