Friday, April 13, 2012

Today I was asked to write a routine that improves the way we clean up URLs.

The goal is to reduce each value in the URL field to its site root (scheme and host only), so that crawlers can start crawling the website from its home page.

For example:

http://www.abc.com/welcome.html
http://www.msn.com/default/


should be converted to

http://www.abc.com
http://www.msn.com


Code Snippet
  1. using System;
  2. using System.Collections.Generic;
  3. using System.Linq;
  4. using System.Text;
  5. using System.Text.RegularExpressions;
  6.  
  7. namespace ScrubWebUrl
  8. {
  9.     class Program
  10.     {
  11.         static void Main(string[] args)
  12.         {
  13.  
  14.             #region "Scrub url examples"
  15.             List<string> weburls = new List<string>();
  16.  
  17.             weburls.Add(string.Empty);
  18.             weburls.Add("");
  19.             weburls.Add("http://www.yahoo.co.in/index.html");
  20.             weburls.Add("http://www.yahoo.ca/index/main.asp");
  21.             weburls.Add("http://blogger.yahoo.ca/index/main.aspx");
  22.             weburls.Add("http://www.yahoo.ca/yahoo/main.jsp");
  23.             weburls.Add("http://www.rediff.ca/mail/sell/ma.ashx");
  24.             weburls.Add("http://3ww.janus.com/account/securelogin/");
  25.             weburls.Add("http://2ww.xca/index/secure/main");
  26.             weburls.Add("http://www.maquet.com");
  27.             weburls.Add("http://ca.maquet.com");
  28.             weburls.Add("http://www.datascope.com");
  29.             weburls.Add("http://www.datascope.com/index.html");
  30.             weburls.Add("http://www.rediff.com/abc/crap.html");
  31.             weburls.Add("http://www.mnghardware.com");
  32.             weburls.Add("http://anzaexotics.com/home");
  33.             weburls.Add("http://www.empowercom.net");
  34.             weburls.Add("http://www.cgulfc.com/home.asp");
  35.             weburls.Add("http://www.chefrubber.com");
  36.             weburls.Add("http://www.mathers-team.com");
  37.             weburls.Add("http://www.2crave.com");
  38.             weburls.Add("http://www.tmgwest.com");
  39.             weburls.Add("http://www.next-communications.com");
  40.             weburls.Add("http://www.nextcom.com");
  41.             weburls.Add("http://www.fishertracks.com");
  42.             weburls.Add("http://www.summitengineer.net");
  43.             weburls.Add("http://www.cablofil.com");
  44.             weburls.Add("http://safety.det-tronics.com");
  45.             weburls.Add("http://www.detronics.com/utcfs/templates/pages/template-46/1,8060,pageid=2494&siteid=462,00.html");
  46.             weburls.Add("http://www.detronics.com");
  47.             weburls.Add("http://www.ixp.tz.net");
  48.             weburls.Add("http://clev11.com/~composi1");
  49.             weburls.Add("http://saint-joseph.michiganpages.org/c-224509.htm");
  50.             weburls.Add("http://www.marriott.com/hotels/travel/atlrb-renaissance-atlanta-waverly-hotel");
  51.             weburls.Add("http://www.chevron.com/about/our_businesses/mining.asp");
  52.             weburls.Add("http://www.tria.com/sports_medicine_fellowship.aspx");
  53.             weburls.Add("http://www.cgc-jp.com/products/finechemicals/index.html");
  54.             weburls.Add("http://www.pollockpaper.com/packaging.asp");
  55.             weburls.Add("http://alliedhightech.com/imaging");
  56.             weburls.Add("http://www.as.ua.edu/english/03_graduate/maphd");
  57.             weburls.Add("http://www.publicautoauctionassoc.org");
  58.             weburls.Add("http://www.clubsafetysolutions.com");
  59.             weburls.Add("http://www.groupe.e.ch");
  60.             weburls.Add("http://www.distel.nl");
  61.             weburls.Add("http://www.familydoctor.org/valleyhealthw");
  62.             weburls.Add("http://www.importcostumes.com/pony+express+creations,+inc.html");
  63.             weburls.Add("http://www.faseb.org/society-management-services/project-management-services.aspx");
  64.             weburls.Add("http://www.mindware.it/masterpack");
  65.             weburls.Add("http://www.water-softeners-filters.com");
  66.             weburls.Add("http://www.aspengrovekitchenandbath.com");
  67.             weburls.Add("http://www.stratatechcorp.com/products/stratatest.php");
  68.             weburls.Add("http://www.tri3bar.com");
  69.             weburls.Add("http://www.brownandsharpe.com/?utm_source=agma&utm_medium=listing&utm_campaign=gears");
  70.             //    weburls.Add("http://www.brownandsharpe.com www.hexagonmetrology.us");
  71.             weburls.Add("http://www.hexagonmetrology.us/?utm_source=sae&utm_medium=directory_listing&utm_campaign=hexagon");
  72.             weburls.Add("http://www.mobibon.com.tw");
  73.             weburls.Add("http://www.pivotalhealthsolutions.com/default.aspx");
  74.             weburls.Add("http://www.pivotalhealthsolutions.com/athletics");
  75.             weburls.Add("http://www.aspengrovekitchenandbath.com");
  76.             weburls.Add("http://www.stratatechcorp.com/products/stratatest.php");
  77.             weburls.Add("http://www.tri3bar.com");
  78.             weburls.Add("http://www.brownandsharpe.com/?utm_source=agma&utm_medium=listing&utm_campaign=gears");
  79.             //    weburls.Add("http://www.brownandsharpe.com www.hexagonmetrology.us");
  80.             weburls.Add("http://www.hexagonmetrology.us/?utm_source=sae&utm_medium=directory_listing&utm_campaign=hexagon");
  81.             weburls.Add("http://www.mobibon.com.tw");
  82.             weburls.Add("http://www.pivotalhealthsolutions.com/default.aspx");
  83.             weburls.Add("http://www.pivotalhealthsolutions.com/athletics");
  84.             weburls.Add("http://www.faseb.org/society-management-services/project-management-services.aspx");
  85.             weburls.Add("http://www.importcostumes.com/pony+express+creations,+inc.html");
  86.             #endregion
  87.  
  88.             ScrubTheseUrls(weburls);
  89.  
  90.         }
  91.  
  92.         private static void ScrubTheseUrls(List<string> weburls)
  93.         {
  94.  
  95.             Console.WriteLine("The input urls count is :" + weburls.Count);
  96.  
  97.             List<string> scrubbedUrls = new List<string>();
  98.  
  99.             foreach (string oldurl in weburls)
  100.             {
  101.                 scrubbedUrls.Add(Scrbber(oldurl));
  102.             }
  103.  
  104.             foreach (string newurl in scrubbedUrls)
  105.             {
  106.                 Console.WriteLine(newurl);
  107.             }
  108.  
  109.             Console.WriteLine("The scrubbed urls count is :" + scrubbedUrls.Count);
  110.  
  111.             Console.ReadKey();
  112.  
  113.         }
  114.  
  115.         private static string Scrbber(string oldurl)
  116.         {
  117.  
  118.             string regexp = "http://*[^/]*";
  119.  
  120.             return Regex.Match(oldurl, regexp).Value;
  121.  
  122.         }
  123.     }
  124. }

No comments:

Post a Comment