Email Scraper Tool

This tool walks through a website's sitemap and returns every unique email address it finds on any listed page. For the tool to work, it needs to be given a direct link to the sitemap, e.g. https://www.example.com/sitemap.xml. It also loops through all nested sitemaps. It will find some false positives, and it can take a long time to run on large sites. To use it, download the script and place it in a folder. When it runs, it creates files called email-addresses.csv and discovered-pages.txt in the same folder where it is located.


        # Crawls one sitemap: recurses into nested sitemaps, downloads every listed
        # page, and collects the e-mail addresses found in the page bodies.
        #
        # Parameters:
        #   $sitemapUrl - URL of the sitemap (or sitemap index) to process.
        #   $seenEmails - optional hashtable shared with nested calls so that an
        #                 address is reported and exported only once per run.
        #                 Callers normally omit it; a fresh table is created.
        #
        # Output: progress messages and each newly found address (Write-Output).
        # Side effects: appends every listed page URL to discovered-pages.txt and
        #               every newly found address to email-addresses.csv (both
        #               paths are relative).
        function ProcessSitemap($sitemapUrl, $seenEmails = @{}) {
            # The TLD class is [A-Za-z]; the former [A-Z|a-z] also matched a literal '|'.
            $regexPattern = "\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
            $logFile = "discovered-pages.txt"
            $emailFile = "email-addresses.csv"

            # A sitemap that cannot be downloaded or parsed is skipped instead of
            # letting the following XML calls run against $null.
            try {
                $response = Invoke-WebRequest $sitemapUrl -UseBasicParsing -ErrorAction Stop
                $sitemap = [xml]$response.Content
            }
            catch {
                Write-Warning "Skipping sitemap ${sitemapUrl}: $($_.Exception.Message)"
                return
            }

            $nsMgr = New-Object System.Xml.XmlNamespaceManager -ArgumentList $sitemap.NameTable
            $nsMgr.AddNamespace("ns", "http://www.sitemaps.org/schemas/sitemap/0.9")

            # InnerText covers both plain-text and CDATA <loc> values; trimming removes
            # whitespace inside the element that would defeat the ".xml" suffix check.
            # @() keeps the result an array even for zero or one match.
            $pageUrls = @(
                $sitemap.SelectNodes("//ns:loc", $nsMgr) |
                    ForEach-Object { $_.InnerText.Trim() } |
                    Where-Object { $_ }
            )

            Write-Output "Found $($pageUrls.Count) URLs in the sitemap."

            # Addresses first seen during THIS call; only these are appended to the CSV,
            # so nested calls sharing $seenEmails never write the same address twice.
            $newEmails = [System.Collections.Generic.List[string]]::new()

            foreach($pageUrl in $pageUrls) {
                if($pageUrl.EndsWith(".xml")) {
                    if([Uri]::IsWellFormedUriString($pageUrl, [UriKind]::Absolute)) {
                        Write-Output "Processing absolute nested sitemap $pageUrl..."
                        ProcessSitemap $pageUrl $seenEmails
                    }
                    else {
                        # Resolve a relative entry against the URL of the current sitemap.
                        $nestedSitemapUrl = [System.Uri]::new([System.Uri]::new($sitemapUrl), $pageUrl).AbsoluteUri
                        Write-Output "Processing relative nested sitemap $nestedSitemapUrl..."
                        ProcessSitemap $nestedSitemapUrl $seenEmails
                    }
                }
                else {
                    Write-Output "Processing page $pageUrl..."

                    # Start from an empty body so a failed request cannot cause the
                    # previous page's content to be scanned again.
                    $pageContent = ""
                    try {
                        $pageContent = (Invoke-WebRequest $pageUrl -UseBasicParsing -ErrorAction Stop).Content
                    }
                    catch {
                        Write-Warning "Could not download ${pageUrl}: $($_.Exception.Message)"
                    }

                    foreach($emailMatch in [regex]::Matches($pageContent, $regexPattern)) {
                        $email = $emailMatch.Value.Trim()
                        if(-not $seenEmails.ContainsKey($email)) {
                            Write-Output $email
                            $seenEmails.Add($email, $true)
                            $newEmails.Add($email)
                        }
                    }

                    Add-Content -Path $logFile -Value $pageUrl
                }
            }

            # Piping nothing ($null) into Export-Csv raises a binding error, so only
            # export when this call actually discovered new addresses.
            if($newEmails.Count -gt 0) {
                $newEmails |
                    Select-Object @{Name='Email';Expression={$_}} |
                    Export-Csv $emailFile -NoTypeInformation -Append
            }

            Write-Output "All unique email addresses found have been exported to $emailFile."
        }
        
        # Script entry point: ask the user for the root sitemap, then crawl it.
        $sitemapUrl = Read-Host -Prompt "Enter the sitemap URL to scrape"
        ProcessSitemap -sitemapUrl $sitemapUrl