// Crawler for Amazon's book best-seller category pages.  Starting from a
// seed page, it follows every category link it finds, printing one
// tab-separated line (id, parent id, full title) per newly seen category.
class Amazon {
   // This is an array of Category objects still waiting to be processed.
   // Items are pushed on the end and popped from the front.
   var queue = new array

   // This is a set of seen category codes (as integer)
   var seen = new set

   // This is a dictionary mapping a category code (integer) to its
   // Category object.
   var hierarchy = new dict

   // Fetches the page for each queued category until the queue is empty.
   // readPage may add further categories to the queue while this runs.
   processQueue[] := {
      while length[queue] > 0 {
         cat = queue.popFirst[]
         readPage["http://www.amazon.com/" + cat.urlpart + "/zgbs/books/" + cat.id + "/", cat.id]
         // println["processing " + cat]
         // println[hierarchy]
         // println[length[queue] + " items in queue."]
      }
   }

   // Reads the page at url (decoded as windows-1252) and passes every
   // category link found in it to addQueue.
   //
   //   url:      address of the page to fetch.
   //   parentID: id recorded as the parent of each category found on the
   //             page (the initial call in this file passes undef).
   readPage[url, parentID] := {
      page = read[url, "windows-1252"]
      // println[page]

      // Earlier form of the pattern, kept for reference:
      // for [urlpart, index, title] = parts = page =~ %r/['"]http:\/\/www\.amazon\.com\/([^\/]+)\/zgbs\/books\/(\d+)\/[^"']*?['"]\s*>([^<]+)/g

      // Each match yields the URL path segment, the numeric category id,
      // and the text that follows the closing ">" of the link.
      for [urlpart, index, title] = page =~ %r/['"]http:\/\/www\.amazon\.com\/([^\/]+)\/zgbs\/books\/(\d+)[^'"]*['"]\s*>([^<]+)/g {
         addQueue[urlpart, parseInt[index], parentID, title]
      }
   }

   // Add a new category to the queue
   //
   //   urlpart:  URL path segment captured for the category.
   //   id:       category code (integer).
   //   parentID: code of the category whose page contained the link.
   //   title:    link text of the category.
   //
   // Does nothing if the id was already seen or if the category links to
   // itself.  Otherwise records the category, prints it, and queues it.
   addQueue[urlpart, id, parentID, title] := {
      if seen.contains[id] or id == parentID
         return

      seen.put[id]
      // println["Adding $title"]

      // Prefix the title with the parent's full title, when the parent is
      // known, so that the printed title shows the whole path.
      parent = hierarchy@parentID
      parentTitle = (parent != undef) ? parent.title + " | " : ""
      fullTitle = parentTitle + title

      cat = new Category[urlpart, id, parentID, fullTitle]
      println["$id\t$parentID\t$fullTitle"]
      queue.push[cat]
      hierarchy@id = cat
   }
}

// Plain record describing one category found during the crawl.
class Category {
   // URL path segment used when building the category's page address.
   var urlpart

   // Category code.
   var id

   // Code of the category on whose page this one was found.
   var parentID

   // Title as passed to the constructor (addQueue passes the full,
   // parent-prefixed title).
   var title

   // Stores the four constructor arguments in the corresponding fields.
   new[url, i, parent, t] := {
      urlpart = url
      id = i
      parentID = parent
      title = t
   }
}

// Seed the crawl from the top-level books best-seller page, then drain
// the queue of categories it discovered.
a = new Amazon
a.readPage["http://www.amazon.com/gp/bestsellers/books/ref=sv_b_2", undef]
a.processQueue[]
//println[a.hierarchy]