Files
wagic/projects/mtg/tools/gatherer/soup-parser.py
wagic.the.homebrew d45e3b101b
2008-11-02 09:50:16 +00:00

111 lines
3.3 KiB
Python

#!/usr/bin/python
#
# Requires BeautifulSoup version 3, available from
# http://www.crummy.com/software/BeautifulSoup/
#
# Usage soup-parser.py <xml file> [...]
#
# Updates text, subtype and name for all cards in the given xml file/s
#
# *** Overwrites the file/s given... use copies ***
#
# Copyright 2006: Nathan Callahan
# Feel free to do what you want with this file, but give credit
# where it's due.
from BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, Tag
import re
import codecs
import sys
import urllib
from mtgCommon import *
# Passed by doGathererUpdate as the second argument to cleanupHTMLText.
# Judging by the name it controls whether reminder text is removed from the
# card text -- confirm against cleanupHTMLText in mtgCommon.
stripReminderText = True
def matchNames(tag, name):
    """Tell whether ``tag`` is a <card> element whose name matches ``name``.

    Tags that are not named ``card`` never match.  For a ``card`` tag, both
    the tag's ``name`` attribute and ``name`` are run through
    ``translate(nametrans)`` before being compared, so the comparison is made
    on the translated strings rather than on the raw ones.  ``nametrans`` is
    not defined in this file (presumably it comes from mtgCommon).

    Returns True or False.
    """
    if tag.name != 'card':
        return False
    translatedTagName = tag['name'].translate(nametrans)
    translatedName = name.translate(nametrans)
    return translatedTagName == translatedName
def doGathererUpdate(xml, soup):
    """Copy card data from a parsed Gatherer spoiler page into the set XML.

    For each result row found in ``soup`` the card name, text, type and
    subtype are extracted and written onto every <card> element of ``xml``
    that matchNames() pairs with that name.  ``xml`` is modified in place.
    Afterwards the names that could not be paired up are printed, in both
    directions (in the file but not on Gatherer, and vice versa).

    xml  -- parsed set file; its <card> tags are updated in place
    soup -- parsed Gatherer page containing the ``_gridResults`` element

    Returns None.
    """
    # Names exactly as the file spells them.  Entries are removed as cards
    # are matched, so whatever remains was not found on Gatherer.
    xmlCards = [c['name'] for c in xml.findAll('card')]
    gathererCards = []

    resultRows = soup.find(id="_gridResults").findAll(
        'tr', onmouseover="this.style.backgroundColor='#F5DEB3';")
    for cardRow in resultRows:
        name = cardRow('b')[0].string
        name = name.replace('"', '&quot;')
        htmlText = cardRow('td')[3]
        htmlText = replaceSymbols(htmlText)
        text = cleanupHTMLText(htmlText, stripReminderText)
        supertype, subtype = getCardTypes(cardRow)

        # split_re is defined outside this file; when it matches, the text
        # and type are rewritten as two halves joined by " // ".
        splitCard = split_re.match(text)
        if splitCard:
            text = splitCard.group('t1') + " // " + splitCard.group('t2')
            supertype = supertype + " // " + splitCard.group('type2')

        cards = xml.findAll(lambda tag: matchNames(tag, name))
        if not cards:
            gathererCards.append(name)
            continue

        for card in cards:
            # Remember the file's own spelling before overwriting it:
            # xmlCards holds that spelling, and matchNames pairs cards on
            # translated names, so it may differ from Gatherer's ``name``.
            oldName = card['name']

            card['name'] = name
            card['text'] = text
            card.find('attr', key='type')['value'] = supertype

            if subtype:
                s = card.find('attr', key='subtype')
                if s is None:
                    # Keep a reference to the tag we create instead of
                    # re-finding "the last attr": if the card's last child
                    # is already an attr, insert(-1) places the new tag
                    # before it and the lookup would return the wrong tag.
                    s = Tag(xml, "attr")
                    card.insert(-1, s)
                    s['key'] = 'subtype'
                s['value'] = subtype

            # Remove some useless attributes from previous versions
            for key in ('cost', 'subtype'):
                staleTag = card.find('attr', key=key)
                if staleTag is not None and not staleTag['value']:
                    staleTag.extract()

            # Guarded removal: the same card may be matched by more than one
            # Gatherer row, in which case it has already been ticked off.
            if oldName in xmlCards:
                xmlCards.remove(oldName)

    # Single-argument parenthesised print gives the same output as the
    # Python 2 print statement this script was written with.
    if xmlCards:
        print("Cards in file not found in gatherer:")
        print(' ' + '\n '.join(xmlCards))
    if gathererCards:
        print("Cards in gatherer not found in file:")
        print(' ' + '\n '.join(gathererCards))
# Script body: for every set file named on the command line, fetch the
# matching Gatherer spoiler page, merge its data into the file's cards and
# write the result back over the same file.
for arg in sys.argv[1:]:
    print('Reading: %s' % arg)
    f = open(arg)
    try:
        contents = f.read()
    finally:
        f.close()
    xml = BeautifulStoneSoup(contents, selfClosingTags=['attr'])

    # ``sets`` is not defined in this file (presumably from mtgCommon); it is
    # indexed by the set abbreviation stored on the <ccg-setinfo> tag.
    setinfo = sets[xml.find('ccg-setinfo')['abbrev']]
    # NOTE(review): the literal backslash before '&' is sent as part of the
    # URL; it looks like a leftover shell escape -- confirm before changing.
    url = "http://ww2.wizards.com/gatherer/Index.aspx?setfilter=%s\\&output=Spoiler" % setinfo['gathname']
    print('Fetching/Parsing: %s' % url)
    gatherer = urllib.urlopen(url)
    try:
        page = gatherer.read()
    finally:
        # The connection was previously never closed.
        gatherer.close()
    soup = BeautifulSoup(page, smartQuotesTo=None)

    print('Processing:' + arg)
    doGathererUpdate(xml, soup)

    # Serialise BEFORE opening the file for writing: opening with 'w'
    # truncates it, so a failure in prettify() would otherwise destroy the
    # input file.
    output = xml.prettify('ISO-8859-1')
    print('Writing: %s' % arg)
    f = open(arg, 'w')
    try:
        f.write(output)
    finally:
        f.close()