--TOjet6/crW Content-Type: text/plain; charset=us-ascii Content-Description: message body text Content-Transfer-Encoding: 7bit I wrote a small script to extract patch meta data from the SourceForge patch manager and install them in a local Postgres database. This code might be useful for other folks, although the second half (the Postgres half) depends on having PyGreSQL installed. I'm posting the current source here in case anyone else wants to fiddle. It's definitely rough. Attached is the code and the DB schema. Jeremy --TOjet6/crW Content-Type: text/plain Content-Description: python script Content-Disposition: inline; filename="patchtool.py" Content-Transfer-Encoding: 7bit """Screen scraper for Patch Manager interface The patch form URL is http://www.sourceforge.net/patch/index.php. GET method If I'm lucky, it can be used without authentication. the input fields are: (* means hidden field) *group_id=5470 *custom=set _assigned_to=None _status=None This script produces the following HTML for each entry: <TR BGCOLOR='#FFFFFF'> <TD><A HREF='?func=detailpatch&patch_id=100518&group_id=5470'>100518</A> </TD> <TD>fix bltinmodule.c for 64-bit platforms</TD> <TD>2000-Jun-07 03:21</TD> <TD>gvanrossum</TD> <TD>tmick</TD></TR> If there are more than 50 patches, the following HTML is produced: <TR><TD COLSPAN='2'> </TD><TD> </TD><TD COLSPAN='2'><A HREF='?func=browse&group_id=5470&set=custom&_assigned_to=100&_status=100&offset=50'><B>Next 50 --></B></A></TD></TR></TABLE> <!-- end content --> Future plans: support authentication command-line interface for modifying patches """ import cgi import re import types from urllib import urlencode from urlparse import urljoin from urllib2 import urlopen import pg VERBOSE = 0 DATABASE = 'jeremy' class PatchListParser: """Minimal re-based parsed that grabs relevant URLs from summary""" rx_href = re.compile('HREF="([?/=&_A-Za-z0-9]+)"') def parse_hrefs(self, buf): hrefs = [] offset = 0 while 1: mo = self.rx_href.search(buf, offset) if mo is None: break offset = mo.end(1) hrefs.append(mo.group(1)) return hrefs def get_query_hrefs(self, buf): queries = [] for href in self.parse_hrefs(buf): if href[0] == '?': queries.append(href) return queries class PatchParser: """Minimal re-based parser that pulls key-values from patch page""" rx_entry = re.compile('<TD[^>]*><B>(.+):</B><BR>(.+)</TD>') def parse(self, buf): entries = {} offset = 0 while 1: mo = self.rx_entry.search(buf, offset) if mo is None: break offset = mo.end(2) k, v = mo.group(1, 2) entries[k] = v return entries class SQLMapping: """Decode a simple mapping from an SQL table Assumes that the keys and values are disjoint, so that a single interface can resolve in either direction. """ def __init__(self, db, table, fields="*"): self.dict1 = {} self.dict2 = {} r = db.query("SELECT %s FROM %s" % (fields, table)).getresult() for key, val in r: assert None not in (key, val) self.dict1[key] = val self.dict2[val] = key def lookup(self, kv): r = self.dict1.get(kv) if r is None: r = self.dict2.get(kv) return r def urldecode(query): d = cgi.parse_qs(query) for k, v in d.items(): if len(v) != 1: raise ValueError, "unexpected duplicate entry" d[k] = v[0] return d class PatchManager: url = "http://www.sourceforge.net/patch/index.php" group_id = 5470 list_parser = PatchListParser() patch_parser = PatchParser() # XXX to get the right numeric values for assigned_to and status, # would need to scrape them out of the form... def get_patches(self, assigned_to='100', status='100'): assert type(assigned_to) == types.StringType assert type(status) == types.StringType url = self._get_initial_query(assigned_to, status) patch_list = self._load_patch_summary(url) patches = {} for patch_id, p in patch_list: patches[patch_id] = self._load_patch_detail(p) return patches def _get_initial_query(self, assigned_to, status): dict = {'group_id': self.group_id, 'set': 'custom', 'SUBMIT': 'Browse', '_assigned_to': assigned_to, '_status': status, } query = urlencode(dict) return "%s?%s" % (self.url, query) def _load_patch_summary(self, url): todo = [(url, 0)] patches = [] offset = 0 while todo: url, offset = todo[0] del todo[0] buf = urlopen(url).read() for href in self.list_parser.get_query_hrefs(buf): d = urldecode(href[1:]) if d['func'] == 'detailpatch': patches.append((int(d['patch_id']), urljoin(self.url, href))) elif d['func'] == 'browse': new_offset = int(d['offset']) if new_offset > offset: todo.append((urljoin(self.url, href), new_offset)) return patches def _load_patch_detail(self, url): buf = urlopen(url).read() return self.patch_parser.parse(buf) class PatchDBInterface: """Interface between the PatchManager and the SQL database Scheme for the patches table is: CREATE TABLE patches_t ( patch_id int PRIMARY KEY, summary text, status int REFERENCES status_t, category text, date text, submitted_by int REFERENCES users_t, assigned_to int REFERENCES users_t, summary_url text ); """ def __init__(self, db): self.db = db self.users = SQLMapping(db, 'users_t') self.status = SQLMapping(db, 'status_t') self.num_deletes = 0 def update(self, patch_id, attrs): # resolve REFERENCES status = self.status.lookup(attrs['Status']) submitted_by = self.users.lookup(attrs['Submitted By']) if submitted_by is None: submitted_by = 0 assigned_to = self.users.lookup(attrs['Assigned To']) if assigned_to is None: assigned_to = 100 # delete old version if necessary if self.has_patch(patch_id): q = "DELETE FROM patches_t WHERE patch_id = %(patch_id)d" self.db.query(q % locals()) self.num_deletes = self.num_deletes + 1 d = locals() del d['attrs'] # just to make debugging prints clearer for k, v in attrs.items(): d[k] = pg._quote(v, 0) q = "INSERT INTO patches_t VALUES (%(patch_id)d," \ " %(Summary)s, %(status)d, %(Category)s, %(Date)s," \ " %(submitted_by)d, %(assigned_to)d)" self.db.query(q % d) def has_patch(self, patch_id): r = self.db.query("SELECT * FROM patches_t" \ " WHERE patch_id = %d" % patch_id).getresult() if r: return 1 else: return 0 if __name__ == "__main__": import sys import getopt opts, args = getopt.getopt(sys.argv[1:], 'vd:') assert len(args) == 0 for k, v in opts: if k == '-v': VERBOSE = 1 elif k == '-d': DATABASE = v pmgr = PatchManager() if VERBOSE: print "Loading patches" p = pmgr.get_patches() if VERBOSE: print "Retrieved %d patches" % len(p) if VERBOSE: print "Inserting into local database" db = pg.connect(DATABASE) pdbi = PatchDBInterface(db) for p_id, attrs in p.items(): pdbi.update(p_id, attrs) if VERBOSE: new = len(p) - pdbi.num_deletes print "Found %d new patches" % new print "Updated %d existing patches" % pdbi.num_deletes --TOjet6/crW Content-Type: text/plain Content-Description: database schema Content-Disposition: inline; filename="patch-schema.txt" Content-Transfer-Encoding: 7bit CREATE TABLE users_t ( user_id int PRIMARY KEY, username text NOT NULL ); CREATE TABLE status_t ( status_id int PRIMARY KEY, name text NOT NULL ); CREATE TABLE patches_t ( patch_id int PRIMARY KEY, summary text, status int REFERENCES status_t, category text, date text, submitted_by int REFERENCES users_t, assigned_to int REFERENCES users_t, summary_url text ); --TOjet6/crW--
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4