Today I was asked to write a routine that improves the cleaning of URLs.
The goal is to clean the URL field so that crawlers can crawl the website: each address is reduced to its scheme and host name.
Ex. http://www.abc.com/welcome.html
http://www.msn.com/default/
should be converted to
http://www.abc.com
http://www.msn.com
The goal is to clean the URL field so that crawlers can crawl the website.
Ex. http://www.abc.com/welcome.html
http://www.msn.com/default/
should be converted to
http://www.abc.com
http://www.msn.com
Code Snippet
- using System;
- using System.Collections.Generic;
- using System.Linq;
- using System.Text;
- using System.Text.RegularExpressions;
namespace ScrubWebUrl
{
    /// <summary>
    /// Console demo that reduces web addresses to their site root
    /// (scheme plus host), e.g. "http://www.abc.com/welcome.html"
    /// becomes "http://www.abc.com".
    /// </summary>
    class Program
    {
        /// <summary>
        /// Matches "http://" or "https://" followed by the host part of an address.
        /// The host ends at the first '/', '?', '#' or whitespace character, so
        /// paths, query strings, fragments and trailing text are all left out.
        /// Built once and reused for every call.
        /// </summary>
        private static readonly Regex SiteRootPattern = new Regex(
            @"https?://[^/?#\s]+",
            RegexOptions.IgnoreCase | RegexOptions.Compiled);

        /// <summary>
        /// Builds the list of sample addresses and prints their scrubbed form.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            #region "Scrub url examples"
            // Duplicated entries are kept on purpose so the printed counts
            // stay the same as in the original sample set.
            List<string> weburls = new List<string>
            {
                string.Empty,
                "",
                "http://www.yahoo.co.in/index.html",
                "http://www.yahoo.ca/index/main.asp",
                "http://blogger.yahoo.ca/index/main.aspx",
                "http://www.yahoo.ca/yahoo/main.jsp",
                "http://www.rediff.ca/mail/sell/ma.ashx",
                "http://3ww.janus.com/account/securelogin/",
                "http://2ww.xca/index/secure/main",
                "http://www.maquet.com",
                "http://ca.maquet.com",
                "http://www.datascope.com",
                "http://www.datascope.com/index.html",
                "http://www.rediff.com/abc/crap.html",
                "http://www.mnghardware.com",
                "http://anzaexotics.com/home",
                "http://www.empowercom.net",
                "http://www.cgulfc.com/home.asp",
                "http://www.chefrubber.com",
                "http://www.mathers-team.com",
                "http://www.2crave.com",
                "http://www.tmgwest.com",
                "http://www.next-communications.com",
                "http://www.nextcom.com",
                "http://www.fishertracks.com",
                "http://www.summitengineer.net",
                "http://www.cablofil.com",
                "http://safety.det-tronics.com",
                "http://www.detronics.com/utcfs/templates/pages/template-46/1,8060,pageid=2494&siteid=462,00.html",
                "http://www.detronics.com",
                "http://www.ixp.tz.net",
                "http://clev11.com/~composi1",
                "http://saint-joseph.michiganpages.org/c-224509.htm",
                "http://www.marriott.com/hotels/travel/atlrb-renaissance-atlanta-waverly-hotel",
                "http://www.chevron.com/about/our_businesses/mining.asp",
                "http://www.tria.com/sports_medicine_fellowship.aspx",
                "http://www.cgc-jp.com/products/finechemicals/index.html",
                "http://www.pollockpaper.com/packaging.asp",
                "http://alliedhightech.com/imaging",
                "http://www.as.ua.edu/english/03_graduate/maphd",
                "http://www.publicautoauctionassoc.org",
                "http://www.clubsafetysolutions.com",
                "http://www.groupe.e.ch",
                "http://www.distel.nl",
                "http://www.familydoctor.org/valleyhealthw",
                "http://www.importcostumes.com/pony+express+creations,+inc.html",
                "http://www.faseb.org/society-management-services/project-management-services.aspx",
                "http://www.mindware.it/masterpack",
                "http://www.water-softeners-filters.com",
                "http://www.aspengrovekitchenandbath.com",
                "http://www.stratatechcorp.com/products/stratatest.php",
                "http://www.tri3bar.com",
                "http://www.brownandsharpe.com/?utm_source=agma&utm_medium=listing&utm_campaign=gears",
                // Left disabled, as in the original sample set (two addresses in one field):
                // "http://www.brownandsharpe.com www.hexagonmetrology.us",
                "http://www.hexagonmetrology.us/?utm_source=sae&utm_medium=directory_listing&utm_campaign=hexagon",
                "http://www.mobibon.com.tw",
                "http://www.pivotalhealthsolutions.com/default.aspx",
                "http://www.pivotalhealthsolutions.com/athletics",
                "http://www.aspengrovekitchenandbath.com",
                "http://www.stratatechcorp.com/products/stratatest.php",
                "http://www.tri3bar.com",
                "http://www.brownandsharpe.com/?utm_source=agma&utm_medium=listing&utm_campaign=gears",
                // Left disabled, as in the original sample set (two addresses in one field):
                // "http://www.brownandsharpe.com www.hexagonmetrology.us",
                "http://www.hexagonmetrology.us/?utm_source=sae&utm_medium=directory_listing&utm_campaign=hexagon",
                "http://www.mobibon.com.tw",
                "http://www.pivotalhealthsolutions.com/default.aspx",
                "http://www.pivotalhealthsolutions.com/athletics",
                "http://www.faseb.org/society-management-services/project-management-services.aspx",
                "http://www.importcostumes.com/pony+express+creations,+inc.html"
            };
            #endregion

            ScrubTheseUrls(weburls);
        }

        /// <summary>
        /// Scrubs every address in <paramref name="weburls"/>, writes the input
        /// count, each scrubbed address on its own line and the output count to
        /// the console, then waits for a key press.
        /// </summary>
        /// <param name="weburls">Addresses to scrub; one output line is produced per entry.</param>
        private static void ScrubTheseUrls(List<string> weburls)
        {
            Console.WriteLine("The input urls count is :" + weburls.Count);

            // Presized: exactly one result is stored per input entry.
            List<string> scrubbedUrls = new List<string>(weburls.Count);
            foreach (string oldurl in weburls)
            {
                scrubbedUrls.Add(Scrbber(oldurl));
            }

            foreach (string newurl in scrubbedUrls)
            {
                Console.WriteLine(newurl);
            }

            Console.WriteLine("The scrubbed urls count is :" + scrubbedUrls.Count);

            // Keeps the console window open until the user presses a key.
            Console.ReadKey();
        }

        /// <summary>
        /// Reduces an address to its scheme and host.
        /// </summary>
        /// <param name="oldurl">The raw address; may be null or empty.</param>
        /// <returns>
        /// The first "http://host" or "https://host" found in
        /// <paramref name="oldurl"/> (original casing preserved), or an empty
        /// string when the input is null, empty or contains no such address.
        /// </returns>
        private static string Scrbber(string oldurl)
        {
            // Regex.Match throws on null input, so null is treated like an empty field.
            if (string.IsNullOrEmpty(oldurl))
            {
                return string.Empty;
            }

            // An unsuccessful match yields an empty Value, which is the "nothing found" result.
            return SiteRootPattern.Match(oldurl).Value;
        }
    }
}
No comments:
Post a Comment