########################################################################### # # robots.txt file for www.gamers.org # # 29-Jan-1997 jve Created. # 02-Feb-1997 jve Allow indexing of specific, unique mirrors. # 04-Feb-1997 jve Allow indexing of dEngine. # 21-Mar-1997 jve Add comprehensive 3daction section, tweak others. # 28-Mar-1997 jve Corrected exclusion standard URL. # Fixed formatting of 'record' by adding # lines. # Added Teleport Pro section until Apache fix made. # 17-Apr-1997 jve Added /forums/HyperNews after noticing attempts # by atext.com and inktomi.com to access (401 errs). # 13-Dec-1997 jve Added 3daction/{00bymultiplay,00cheats,00reviews}. # Added lots of duplicate (symlink) exclusions. # 27-Dec-1997 jve Added /pub/games/{claw,hexen2,quake2,wolf3d} # 01-Feb-1998 jve Added /cgi/split (and /split just in case). # 08-Mar-1998 jve Added 3drealms-related paths. # 18-Mar-1998 jve Added /~bk to exclusions, also added # 3daction/{duke3d,shadowwarrior,speardestiny}/addons. # 31-Mar-1998 jve Added /3daction. # 15-May-1998 jve Removed 3daction/speardestiny (was already Pre-Doom), # Allowed 3daction/{hexen2,halflife,sin}/releases. # 20-Dec-1998 JVE Added ~rkinion/sports/cougboard. # 02-Jan-1999 JVE Added /pub/incoming (sheesh). # 03-Mar-1999 JVE Added Teleport Pro 1.29 # 09-May-2001 JVE Added /pub/3d* (everything now under 3dgamers) # Removed the 3daction section since it's gone # 27-Sep-2001 FPV Added /pub/games/idgames* & unreal # 30-Mar-2006 JVE Added agents from 3dgamers robots.txt # 17-Apr-2011 JVE Added loc-crawler # 22-Jul-2012 FPV Added ~fpv/ subdirs # 14-Jun-2014 FPV Added 007ac9 crawler # 28-Dec-2015 FPV Removed ~fpv/ subdirs # 23-Feb-2021 FPV Added MJ12bot crawler # 10-Apr-2021 FPV Added /cgi/IdgamesUpload.html # 24-Jan-2024 FPV Added SemrushBot crawler # # >>--> REMEMBER!! Synchronize /usr/local/lib/site_perl/Log_support.pm ! 
# # See robot exclusion standard at # http://info.webcrawler.com/mak/projects/robots/norobots.html # # One or more records (separated by one or more blank lines) # consisting of one or more User-agent fields followed by # one or more Disallow fields. # ########################################################################### ############# Exclusions section for specific robots ################# #### Exclude loc-crawler - it gets at high speed w/no delay # accessing from lx8.loc.gov 140.147.249.70 starting April 15 2011 # User-agent: Mozilla/5.0 (compatible; loc-crawler Disallow: /pub #### Exclude TAGENT - it requests robots.txt before every GET # and GETs files too quickly. Here is a sample from the access log: # sv.tkensaku.com - - [22/Jan/2002:11:38:05 -0500] "GET /robots.txt HTTP/1.0" 200 210 "TAGENT/V0.5" # sv.tkensaku.com - - [22/Jan/2002:11:38:06 -0500] "GET /reviews/ HTTP/1.0" 200 14750 "TAGENT/V0.5" # sv.tkensaku.com - - [22/Jan/2002:11:38:08 -0500] "GET /robots.txt HTTP/1.0" 200 210 "TAGENT/V0.5" # sv.tkensaku.com - - [22/Jan/2002:11:38:09 -0500] "GET /previews/ HTTP/1.0" 200 9163 "TAGENT/V0.5" # sv.tkensaku.com - - [22/Jan/2002:11:38:10 -0500] "GET /robots.txt HTTP/1.0" 200 210 "TAGENT/V0.5" # sv.tkensaku.com - - [22/Jan/2002:11:38:12 -0500] "GET /articles/ HTTP/1.0" 200 9489 "TAGENT/V0.5" # User-agent: TAGENT Disallow: / #### Exclude Teleport Pro # # Teleport Pro has a bug where it interprets HREF=".." as a file and # constructs and submits bad URLs, resulting in many Not Found errors. # Apache should redirect URIs ending in ".." to the 'real' directory. # User-agent: Teleport Pro Disallow: / #### Exclude AlkalineBOT # # On 10-Mar-2002 from remote host syr-24-95-161-196.twcny.rr.com # User-agent: AlkalineBOT Disallow: / #### Exclude Whizbang (see http://www.whizbang.com/crawler) # User-agent: Whizbang Disallow: / #### Exclude UniverseBot # # No delay between requests. It strips off trailing slash, thus # triggering redirects. It does both HEAD and GET. 
Sample: # # 07:18:04 "HEAD /companies/ensemble HTTP/1.0" 301 0 "UniverseBot/1.0" # 07:18:06 "HEAD /companies/ensemble/ HTTP/1.0" 200 0 "UniverseBot/1.0" # 07:18:07 "GET /companies/ensemble HTTP/1.0" 301 247 "UniverseBot/1.0" # 07:18:09 "GET /companies/ensemble/ HTTP/1.0" 200 9961 "UniverseBot/1.0" # User-agent: UniverseBot Disallow: / #### Exclude http://www.almaden.ibm.com/cs/crawler # # We'd like to limit the sites crawling us to the main indexers. # User-agent: http://www.almaden.ibm.com/cs/crawler Disallow: / #### Exclude "SlySearch/1.0 http://www.plagiarism.org/crawler/robotinfo.html" # # This site indexes articles for plagiarism checks. # User-agent: SlySearch Disallow: / #### Exclude NG/1.0 # # On 18-Oct-2002 from remote host ng1.exabot.com # # 13:11:35 "GET /news/more/1005254413/d/redir/cb_order/UNRET2003.IR HTTP/1.0" 404 244 "NG/1.0" # 13:11:37 "GET /news/more/1005254413/gi/tattletale/news/ HTTP/1.0" 404 234 "NG/1.0" # 13:11:38 "GET /news/more/1005254413/ews/ HTTP/1.0" 404 219 "NG/1.0" # User-agent: NG/1.0 Disallow: / #### Exclude spider from singingfish.com - no media to index. # User-agent: asterias Disallow: / #### Exclude spider from xo.net - no reason to index our files # User-agent: Gaisbot Disallow: / #### Exclude UbiCrawler # # On 27-Sep-2003 from remote host ubi1.iit.cnr.it # http://ubi.imc.pi.cnr.it/projects/ubicrawler/ # User-agent: UbiCrawler Disallow: / #### Exclude Wget # # It checks this only for recursive operations, not for indiv. files # User-agent: Wget Disallow: / #### Exclude TranSGeniKBot # User-agent: TranSGeniKBot Disallow: / #### Exclude Ocelli/1.1 (http://www.globalspec.com) # User-agent: Ocelli Disallow: / #### Exclude Exabot (http://www.exava.com/) # # Doesn't honor global exclusions. # User-agent: Exabot Disallow: / #### Exclude Pompos (http://www.dir.com/) # # Obscure search site - 1/4 of the URLs have %00 appended. 
# # Stupid thing requires *no* optional space after User-agent: User-agent:Pompos Disallow: / #### Exclude larbin (http://freshmeat.net/projects/larbin/) # # Open source spider that can be used by anyone. :-/ # User-agent: larbin Disallow: / #### Exclude Nutch (http://www.nutch.org/docs/en/bot.html) # # Open source spider that can be used by anyone. :-/ # User-agent: Nutch Disallow: / #### Exclude Jetbot (http://www.jeteye.com/jetbot.html) # # Doesn't honor global exclusions (it fetches /dl pages). # User-agent: Jetbot Disallow: / #### Exclude Yahoo Slurp (http://help.yahoo.com/l/us/yahoo/search/webcrawler/) # # Slurps tons of binaries too, averaging 2 GB/day # User-agent: Slurp Disallow: / #### Exclude http://crawler.007ac9.net/ # # We'd like to limit the sites crawling us to the main indexers. # User-agent: 007ac9 Crawler Disallow: /pub #### Exclude http://www.cyotek.com/cyotek-webcopy # # Offline viewing tool # User-agent: CyotekWebCrawler Disallow: / #### Exclude https://www.httrack.com/ # # Offline viewing tool # User-agent: HTTrack Disallow: / #### Exclude dotbot (http://www.opensiteexplorer.org/dotbot -> https://moz.com/researchtools/ose/dotbot) # User-agent: dotbot Disallow: / #### Exclude BLEXBot (http://webmeup-crawler.com/) # User-agent: BLEXBot Disallow: / #### Exclude serpstatbot (https://serpstatbot.com/) # User-agent: serpstatbot Disallow: / #### Exclude MJ12bot (http://mj12bot.com/) # # Fetches lots of mangled (wrongly nested) paths. 
# User-agent: MJ12bot Disallow: / #### Exclude AhrefsBot (http://ahrefs.com/robot/) # User-agent: AhrefsBot Disallow: / #### Exclude Adsbot (https://seostar.co/robot/) # User-agent: Adsbot Disallow: / #### Exclude DataForSeoBot (https://dataforseo.com/dataforseo-bot) # User-agent: DataForSeoBot Disallow: / #### Exclude test-bot (from Amazon EC2) # User-agent: test-bot Disallow: / #### Exclude SemrushBot (http://www.semrush.com/bot.html) # User-agent: SemrushBot Disallow: / #### Exclude claudebot (Amazon EC2) # User-agent: claudebot Disallow: / #### Exclude GPTbot (Microsoft OpenAI) # User-agent: GPTbot Disallow: / #### Exclude FacebookExternalHit (http://www.facebook.com/externalhit_uatext.php) # User-agent: facebookexternalhit Disallow: / #### Exclude Meta-ExternalAgent (https://developers.facebook.com/docs/sharing/webmasters/crawler) # User-agent: meta-externalagent Disallow: / #### Slow down ImagesiftBot # User-agent: ImagesiftBot Crawl-delay: 5 ################ Exclusions section for ALL robots #################### # # These are plain string patterns - not necessarily directory names - # so directories should have trailing slash if substring of another # directory name (like /a is a substring of /about). User-agent: * # Apply exclusions to all robots Disallow: /~bk/ # BK doesn't want robots Disallow: /~javanree/ # JAB doesn't want robots Disallow: /~rkinion/ # Don't index this Disallow: /wtf/ # JAB doesn't want robots Disallow: /cgi/split/ # Don't index split file mechanism Disallow: /split/ # Maybe we'll be using top-level... Disallow: /cgi/IdgamesUpload.html # Upload form for idgames Disallow: /dhs/tmp/ # Obvious... Disallow: /wwwstats/ # Statistics archive Disallow: /messages/ # Messages archive Disallow: /forums/hypermail/ # Messages archive Disallow: /forums/HyperNews/ # HyperNews scripts (protected by Auth) Disallow: /pub/incoming/ # Upload area Disallow: /pub/archives/mailinglists/ # Messages archive # Disallow: /ftptree/ # Don't mirror top of tree (i.e. 
/.1/ /.2/) # # Only allow from /pub down... Disallow: /pub/mirrors/ # Don't index non-game-name mirror paths # # Allow only one path to a mirror ## allow: /dEngine/ # Real dir Disallow: /pub/3daction/ # Removed during re-org to /pub/3dgamers Disallow: /pub/3dadvent/ # Removed during re-org to /pub/3dgamers Disallow: /pub/3dother/ # Removed during re-org to /pub/3dgamers Disallow: /pub/3drpg/ # Removed during re-org to /pub/3dgamers Disallow: /pub/3dsims/ # Removed during re-org to /pub/3dgamers Disallow: /pub/3dstrategy/ # Removed during re-org to /pub/3dgamers Disallow: /pub/3dgamers/ # No searching - want entry via 3dgamers.com ## allow: /pub/games/3drealms/ # Mirror (->../mirrors/..../3drealms) ## allow: /pub/games/abuse/ # Mirror (->../mirrors/..../abuse) ## allow: /pub/games/claw/ # Real dir ## allow: /pub/games/descent/ # Mirror (->../mirrors/..../descent) Disallow: /pub/games/doom/ # Symlink (->idgames) Disallow: /pub/games/doom2/ # Symlink (->idgames) Disallow: /pub/games/duke3d/ # Symlink (->3drealms/duke3d) Disallow: /pub/games/heretic/ # Symlink (->idgames) Disallow: /pub/games/hexen/ # Symlink (->idgames) Disallow: /pub/games/hexen2/ # Symlink (->idgames2/hexen2) Disallow: /pub/games/idgames/ # Symlink (->../idgames) Disallow: /pub/games/idgames2/ # Symlink (->../idgames2) Disallow: /pub/games/prey/ # Symlink (->3drealms/prey) Disallow: /pub/games/quake/ # Symlink (->idgames2) Disallow: /pub/games/quake2/ # Symlink (->idgames2/quake2) ## allow: /pub/games/strife/ # Real dir Disallow: /pub/games/sw/ # Symlink (->3drealms/sw) Disallow: /pub/games/quake2/ # Symlink (->idgames2/quake2) Disallow: /pub/games/unreal/ # Symlink (->../unreal) Disallow: /pub/games/wolf3d/ # Symlink (->../3daction/Pre-doom/wolf...addons) # # # Don't index duplicate symlinks ## allow: /pub/games/idgames2/idstuff # (->idgames/idstuff) # # # Don't index temporary or short-lived files ## allow: /pub/games/idgames/newstuff/ # (these get moved eventually) ## allow: 
/pub/games/idgames2/newstuff/ # (these get moved eventually) ## allow: /pub/games/idgames2/demos/ # (files older than 2 mos. deleted)