# I was playing with pulling images out of PDF files tonight.
# Thought I'd share.
# --Darrell
"""Pull embedded images out of a PDF file with regular expressions.

The file is scanned as raw bytes for image XObjects whose dictionary
starts with ``/XObject /Subtype /Image``.  ``DCTDecode`` streams are
written out verbatim and ``FlateDecode`` streams are inflated and
re-encoded through PIL.  Every image is saved next to the input as
``<input-without-extension>_<Name>.jpg``.

Runs on Python 2.6+ and Python 3.  PIL/Pillow is imported only when a
``FlateDecode`` image actually has to be re-encoded.
"""

import os
import re
import sys
import zlib

# Captures (rest of the image dictionary, raw stream bytes).  The keyword
# ``endstream`` may be followed by any whitespace: PDFs normally put an
# end-of-line there rather than a space.
_IMAGE_RE = re.compile(
    br"/XObject\s+/Subtype\s+/Image(.*?)stream\s*\n(.*?)endstream\s",
    re.S,
)

# Dictionary entries every image must carry, in the order they are
# unpacked by stripImages(): name, width, height, filter, colour space.
_HEADER_FIELDS = (
    re.compile(br"/name\s+/(\w+)", re.I),
    re.compile(br"/Width\s+(\d+)", re.I),
    re.compile(br"/Height\s+(\d+)", re.I),
    re.compile(br"/filter\s+/(\w+)", re.I),
    re.compile(br"/ColorSpace\s+/(\w+)", re.I),
)

# PDF device colour spaces and the PIL raw mode holding the same samples.
_PIL_MODES = {"DeviceRGB": "RGB", "DeviceGray": "L", "DeviceCMYK": "CMYK"}


def _text(raw):
    """Return *raw* as a native ``str`` (bytes are decoded as latin-1)."""
    if isinstance(raw, str):
        return raw
    return raw.decode("latin-1")


def _parse_header(header):
    """Return (name, width, height, filter, colorspace) or None if one is missing."""
    values = []
    for pattern in _HEADER_FIELDS:
        match = pattern.search(header)
        if match is None:
            return None
        values.append(_text(match.group(1)))
    return tuple(values)


def _decode_flate(data, width, height, color_space):
    """Inflate *data* and wrap the samples in a PIL image.

    Raises ValueError for an unsupported colour space or a sample count
    that does not fit the dimensions, and zlib.error for a corrupt stream.
    """
    mode = _PIL_MODES.get(color_space)
    if mode is None:
        raise ValueError("unsupported color space %s" % color_space)
    try:
        from PIL import Image
    except ImportError:
        import Image  # classic PIL layout used by the original script
    pixels = zlib.decompress(data)
    # fromstring() was renamed to frombytes() and later removed in Pillow.
    frombytes = getattr(Image, "frombytes", None) or Image.fromstring
    return frombytes(mode, (width, height), pixels)


def stripImages(fn):
    """Extract the images embedded in the PDF file *fn*.

    Prints the number of candidate images, then one ``Found:`` or
    ``Skip:`` line per candidate.  An image is skipped when a required
    dictionary entry is missing, its filter is neither ``FlateDecode``
    nor ``DCTDecode``, or its Flate data cannot be decoded.

    Args:
        fn: path of the PDF file to scan.

    Returns:
        The list of image paths written, in file order.
    """
    with open(fn, "rb") as pdf:
        buf = pdf.read()
    # splitext only drops the final extension, so dots elsewhere in the
    # path ("./doc.pdf", "report.v2.pdf") are preserved.
    base = os.path.splitext(fn)[0]

    found = _IMAGE_RE.findall(buf)
    print(len(found))

    written = []
    for header, data in found:
        fields = _parse_header(header)
        if fields is None:
            print("Skip: %s" % _text(header))
            continue
        name, width, height, filter_name, color_space = fields
        print("Found: %s %s %s %s %s" % fields)

        target = "%s_%s.jpg" % (base, name)
        if filter_name == "FlateDecode":
            try:
                image = _decode_flate(
                    data, int(width), int(height), color_space)
            except (zlib.error, ValueError) as err:
                # One undecodable image should not abort the whole run.
                print("Skip: %s (%s)" % (name, err))
                continue
            image.save(target)
        elif filter_name == "DCTDecode":
            # DCT streams are already JPEG data; copy them out unchanged.
            with open(target, "wb") as out:
                out.write(data)
        else:
            print("Skip: %s (unsupported filter %s)" % (name, filter_name))
            continue
        written.append(target)
    return written


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit("usage: %s FILE.pdf" % sys.argv[0])
    stripImages(sys.argv[1])
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML: 3.2 | Encoding: UTF-8 | Version: 0.7.4