Velocity Reviews - Computer Hardware Reviews

Velocity Reviews > Newsgroups > Programming > Python > Problem with uudecode

Reply
Thread Tools

Problem with uudecode

 
 
Juho Saarikko
Guest
Posts: n/a
 
      05-25-2004
I made a Python script which takes Usenet message bodies from a database,
decodes uuencoded contents and inserts them as Large Object into a
PostGreSQL database. However, it appears that the to last few bytes
of uudecoded data are always mangled. Take a look of this hexdump output:

Originals (decoded with Pan, each line is from a different file):
000c2c0 e1bf 00ff 2541 a9e4 a724 d9ff
0011a10 ff54 00d9
00093e0 fb4f a80d ffd9 c200 ffef 00d9

Decoded by the script:
000c2c0 e1bf 00ff 2541 a9e4 a724 d0ff
0011a10 ff54 00d8
00093e0 fb4f a80d ffd9 c200 ffef 00d8

As you can see, one of the last two bytes gets altered in all cases.

The script also outputs the decoded file to disk for debugging purposes,
and the database large object and filesystem file match so it can't be a
PostGreSQL problem.

So, if anyone has any idea what is wrong, please tell me ? I can't found
any reason why the bytes would get mangled...

The script follows:

#!/usr/local/bin/python2.3

# Insert message contents into the database, for each message-id already there
#
# Copyright 2004 by Juho Saarikko
# License: GNU General Public License (GPL) version 2
# See www.gnu.org for details

from pyPgSQL import libpq
import nntplib
import sys
import string
import regex
import sha
import imghdr
import binascii
import StringIO
import os

def strip_trailing_dots(n):
tmp = []
for i in range(len(n)):
if n[i][-1] == "," or n[i][-1] == ".":
tmp.append(n[i][:-1])
else:
tmp.append(n[i])
return tmp

def findmimetype(body, filename):
tail4 = string.lower(filename[-5:])
tail3 = string.lower(filename[-4:])
if tail4 == ".jpeg":
return "image/jpeg"
if tail3 == ".jpg":
return "image/jpeg"
if tail3 == ".png":
return "image/png"
if tail3 == ".jpe":
return "image/jpeg"
if tail3 == ".gif":
return "image/gif"
return None

def insert_picture(conn, image, filename):
hash = sha.new(image)
qhash = libpq.PgQuoteBytea(hash.digest())
candidates = conn.query("SELECT id, picture FROM pictures WHERE hash = " + qhash )
if candidates.ntuples > 0:
print "Found possible mathces " + str(candidates.ntuples)
for x in range(candidates.ntuples):
old = candidates.getvalue(x, 1)
old.open("r")
oldpic = old.read()
old.close()
if oldpic == image:
print "Found a match"
ret = (candidates.getvalue(x,0), 1)
return ret
mime = findmimetype(image, filename)
print "attempting to get mimetype"
if mime == None:
print "No mimetype found"
ret = (0, 0)
return ret
mime = libpq.PgQuoteString(mime)
mimeres = conn.query("SELECT id FROM mimetypes WHERE mimetype = " + mime)
if mimeres.ntuples == 0:
conn.query("INSERT INTO mimetypes (mimetype) VALUES (" + mime + ")")
mimeres = conn.query("SELECT id FROM mimetypes WHERE mimetype = " + mime)
mimetype = mimeres.getvalue(0,0)
picture = conn.lo_creat("rw")
picture.open("rw")
picture.write(image)
picture.close()
tmp = conn.query("INSERT INTO pictures (hash, mimetype, picture) VALUES (" + qhash + ", " +str(mimetype) + ", " + picture.name + ")")
temp = conn.query("SELECT id FROM pictures WHERE OID = " + str(tmp.oidValue))
id = temp.getvalue(0,0)
ret = (id, 0)
return ret

def try_decode_and_insert_uuencoded(conn, id):
begin = regex.compile("begin [0-9]+ \(.*\)")
conn.query("BEGIN")
basedir = "kuvat"
message = conn.query("SELECT data FROM fragments_bodies WHERE message = " + str(id) + " ORDER BY line")
# print message.ntuples

keywords = []
picids = []
newpicids = []
n = 0
s = ""
picid = 0
print 'Starting message id ' + str(id)
while n < message.ntuples:
# print "length of row " + str(n)
# print str(message.getlength(n, 0))
# print "Got length"
abcddummy = message.getvalue(n, 0)
# print "Got value"
s = message.getvalue(n, 0)
# print "Got s"
if begin.match(s) > 0:
# if match_beginning(s) > 0:
# print "Begin matched"
body = []
file = begin.group(1)
# file = get_file_name(s)
# print "Starting to decode, at line " + str(n + 1)
for k in range(n+1, message.ntuples):
# print "Decodind row " + str(k)
s = message.getvalue(k, 0)
if s[:3] == "end":
n = k + 1
break
try:
body.append(binascii.a2b_uu(s))
except:
try:
bytes = (((ord(s[0])-32) & 63) * 4 + 3) / 3
body.append(binascii.a2b_uu(s[:bytes]))
except:
print "Broken attachment in message " + str(id)
conn.query("ROLLBACK")
return
# print "Got to end, at line " + str(n)
# print "Attempting to join body"
body = string.join(body, "")
# print "Attempting to hash body"
# hash = sha.new(body)
# qhash = libpq.PgQuoteBytea(hash.digest())
# qbody = libpq.PgQuoteBytea(body)
# print "Attempting to find whether the pic already exists"
print "Mimetype returned " + str(findmimetype(body, file))
# temporary = open("dummy", "wb")
# temporary.write(body)
# temporary.close()
# dummy.write("dsfds")
print "Calling insert function"
picid, exists = insert_picture(conn, body, file)
print "Returned from insert function with value " + str(picid)
if picid > 0:
# already = conn.query("SELECT id FROM pictures WHERE hash = " + qhash)
# if already.ntuples == 0:
# print "Attempting to find mimetype"
# mimetype = findmimetype(body, file)
# print "Found mimetype"
# if mimetype != None:
# o = conn.query("INSERT INTO pictures (picture, hash, mimetype) VALUES (" + qbody + ", " + qhash + ", " + libpq.PgQuoteString(mimetype) + ")")
# already = conn.query("SELECT id FROM pictures WHERE OID = " + str(o.oidValue()));
# already = conn.query("SELECT id FROM pictures WHERE data = " + qbody)
# already = conn.query("SELECT id FROM pictures WHERE hash = " + qhash)
# print "Attempting to insert hash and mimetype"
# conn.query("INSERT INTO pictures (hash, mimetype) VALUES (" + qhash + ", " + libpq.PgQuoteString(mimetype) + ")")
# print "Attempting to get id"
# already = conn.query("SELECT id FROM pictures WHERE hash = " + qhash)
# print "Attempting to get value"
# picid = already.getvalue(0, 0)
print picid
print "Attempting to OK dir"
if os.access(basedir + "/tmp", os.F_OK) != 1:
os.mkdir(basedir + "/tmp")
fh = open(basedir + "/tmp/" + str(picid), "wb")
fh.write(body)
fh.close()
print "File ok"
picids.append(picid)
if exists == 0:
newpicids.append(picid)
if file != "":
keywords.append(file)
# else:
# picid = already.getvalue(0, 0)
# if already.ntuples == 0:
# conn.query("ROLLBACK")
# return
# picids.append(picid)
# if already.ntuples == 0:
# print "already.ntuples == 0, ROLLBACKing"
# conn.query("ROLLBACK")
# return
# print "Appending picid"
# picids.append(picid)
# print "Picid appended"
else:
tmptmp = string.split(s)
tmpkey = strip_trailing_dots(tmptmp)
if len(tmpkey) > 0:
for j in range(len(tmpkey)):
keywords.append(tmpkey[j])
# print "Adding 1 to n"
n = n + 1
if len(picids) > 0:
print "Found " + str(len(picids)) + " pictures (" + str(len(newpicids)) + " new ones)"
# print "Finding Subject"
head = conn.query("SELECT contents FROM fragments_header_contents WHERE message = " + str(id) + " AND header = (SELECT id FROM fragments_header_names WHERE header ilike 'Subject')")
if head.ntuples > 0:
# print "Splitting Subject"
blah = head.getvalue(0,0)
# print str(blah)
blahblah = string.split(str(blah))
# print "Stripping"
abctmpkey = strip_trailing_dots(blahblah)
# print "Stripping done"
# print "Really"
tmpkey = abctmpkey
# print "Subject split"
if len(tmpkey) > 0:
for j in range(len(tmpkey)):
keywords.append(tmpkey[j])
o = conn.query("INSERT INTO messages DEFAULT VALUES")
mid = conn.query("SELECT id FROM messages WHERE OID = " + str(o.oidValue))
messageid = mid.getvalue(0, 0)
nresult = conn.query("SELECT contents FROM fragments_header_contents WHERE message = " + str(id) + " AND header = (SELECT id FROM fragments_header_names WHERE header ILIKE 'Newsgroups')")
if nresult.ntuples > 0:
for x in range(nresult.ntuples):
newsgroups = string.split(nresult.getvalue(x, 0), ",")
if len(newsgroups) > 0:
for y in range (len(newsgroups)):
newsgroup = libpq.PgQuoteString(newsgroups[y])
ngroupres = conn.query("SELECT id FROM newsgroups WHERE name = " + newsgroup)
if ngroupres.ntuples > 0:
newsgid = ngroupres.getvalue(0, 0)
else:
conn.query("INSERT INTO newsgroups (name) VALUES (" + newsgroup + ")")
ngrtmpres = conn.query("SELECT id FROM newsgroups WHERE name = " + newsgroup)
newsgid = ngrtmpres.getvalue(0, 0)
conn.query("INSERT INTO messages_ngroups_glue (message, newsgroup) VALUES (" + str(messageid) + ", " + str(newsgid) + ")")
else:
print "An empty Newsgroups: header at messag " + str(id)
conn.query("ROLLBACK")
return
else:
print "No Newsgroups: header at message " + str(id)
conn.query("ROLLBACK")
return
for x in range(len(picids)):
conn.query("INSERT INTO messages_pictures_glue (message, picture) VALUES (" + str(messageid) + ", " + str(picids[x]) + ")")
if len(keywords) > 0:
for x in range(len(tmpkey)):
qword = libpq.PgQuoteString(str(keywords[x]))
tmp = conn.query("SELECT id FROM keywords_words WHERE keyword = " + qword)
if tmp.ntuples == 0:
conn.query("INSERT INTO keywords_words (keyword) VALUES (" + qword + ")")
tmp = conn.query("SELECT id FROM keywords_words WHERE keyword = " + qword)
keyid = str(tmp.getvalue(0, 0))
for y in range(len(picids)):
conn.query("INSERT INTO keywords_glue(word, picture) VALUES (" + keyid + ", " + str(picids[y]) + ")")
dummyone = "SELECT fragments_header_contents.line, fragments_header_names.header,"
dummytwo = " fragments_header_contents.contents FROM fragments_header_names, fragments_header_contents"
dummythree = " WHERE fragments_header_contents.message = " + str(id)
dummyfour = " AND fragments_header_contents.header = fragments_header_names.id"
head = conn.query(dummyone + dummytwo + dummythree + dummyfour)
if head.ntuples > 0:
for h in range(head.ntuples):
qhead = libpq.PgQuoteString(str(head.getvalue(h, 1)))
qcont = libpq.PgQuoteString(str(head.getvalue(h, 2)))
tmp = conn.query("SELECT id FROM header_names WHERE header = " + qhead)
if tmp.ntuples == 0:
conn.query("INSERT INTO header_names (header) VALUES (" + qhead + ")")
tmp = conn.query("SELECT id FROM header_names WHERE header = " + qhead)
headid = str(tmp.getvalue(0, 0))
line = str(head.getvalue(0, 0))
conn.query("INSERT INTO header_contents (header, message, line, contents) VALUES (" + headid + ", " + str(messageid) + ", " + line + ", " + qcont + ")")
conn.query("DELETE FROM fragments_header_contents WHERE message = " + str(id))
conn.query("DELETE FROM fragments_bodies WHERE message = " + str(id))
conn.query("COMMIT")
if len(newpicids) > 0:
tmpdir = basedir + "/tmp/"
for i in range(len(newpicids)):
picid = newpicids[i]
tmppicname = tmpdir + str(picid)
permpicname = basedir + "/" + str(picid%1000) + "/" + str(picid)
print tmppicname
print permpicname
if os.access(basedir + "/" + str(picid%1000), os.F_OK) != 1:
os.mkdir(basedir + "/" + str(picid%1000))
os.link(tmppicname, permpicname)
os.unlink(tmpdir +str(picid))
else:
print "No pictures found"
conn.query("ROLLBACK")
return


database = libpq.PQconnectdb('dbname = kuvat')
items = database.query("SELECT message FROM whole_attachments")

# try_decode_and_insert_uuencoded(database, 5407)

for i in range(items.ntuples):
try:
print 'Starting call ' + str(i)
try_decode_and_insert_uuencoded(database, items.getvalue(items.ntuples - 1 - i,0))
print ' returned from call ' + str(i)
except:
print 'Some other error occurred at message " + str(i) + ", trying to continue...'


 
Reply With Quote
 
 
 
 
Ville Vainio
Guest
Posts: n/a
 
      05-25-2004
>>>>> "Juho" == Juho Saarikko <(E-Mail Removed)> writes:

Juho> I made a Python script which takes Usenet message bodies
Juho> from a database, decodes uuencoded contents and inserts them
Juho> as Large Object into a PostGreSQL database. However, it
Juho> appears that the to last few bytes

I skimmed through your program, and noticed that you use binascii
module uuencode/decode. Have you given the "uu" module a try, to see
if it works better?

Also, get rid of "regex" module, it even gives a DeprecationWarning
suggesting switching to "re".

--
Ville Vainio http://tinyurl.com/2prnb
 
Reply With Quote
 
 
 
 
Juho Saarikko
Guest
Posts: n/a
 
      05-25-2004
On Tue, 25 May 2004 22:04:24 +0300, Ville Vainio wrote:

>>>>>> "Juho" == Juho Saarikko <(E-Mail Removed)> writes:

>
> Juho> I made a Python script which takes Usenet message bodies
> Juho> from a database, decodes uuencoded contents and inserts them
> Juho> as Large Object into a PostGreSQL database. However, it
> Juho> appears that the to last few bytes
>
> I skimmed through your program, and noticed that you use binascii
> module uuencode/decode. Have you given the "uu" module a try, to see
> if it works better?


I did examine the uu module, but it would seem that I'd had to parse the
message first anyway to get the file name and the non-binary parts of the
message as keywords. Besides, as I understand it, the uu module uses the
binascii module, so if there's something wrong with the binascii module,
the uu module can't possibly work well.

Oh well, I would had to write the parsing engine anyway (or learn to
use the e-mail classes), to properly handle mime and yenc messages. And I
suppose I'd better start using imagemagic to verify the mimetype of
decoded files, instead of just believing the filename. And join together
files that have been spread over multiple messages. Work, work, work...

> Also, get rid of "regex" module, it even gives a DeprecationWarning
> suggesting switching to "re".


I would, if I knew how to make regular expressions; I found the uu-parsing
snippet from the net and built my script around it, but the
regular expression doesn't seem to work with the re module.
 
Reply With Quote
 
Steve Holden
Guest
Posts: n/a
 
      05-25-2004
Juho Saarikko wrote:
> I made a Python script which takes Usenet message bodies from a database,
> decodes uuencoded contents and inserts them as Large Object into a
> PostGreSQL database. However, it appears that the to last few bytes
> of uudecoded data are always mangled. Take a look of this hexdump output:
>
> Originals (decoded with Pan, each line is from a different file):
> 000c2c0 e1bf 00ff 2541 a9e4 a724 d9ff
> 0011a10 ff54 00d9
> 00093e0 fb4f a80d ffd9 c200 ffef 00d9
>
> Decoded by the script:
> 000c2c0 e1bf 00ff 2541 a9e4 a724 d0ff
> 0011a10 ff54 00d8
> 00093e0 fb4f a80d ffd9 c200 ffef 00d8
>
> As you can see, one of the last two bytes gets altered in all cases.
>
> The script also outputs the decoded file to disk for debugging purposes,
> and the database large object and filesystem file match so it can't be a
> PostGreSQL problem.
>
> So, if anyone has any idea what is wrong, please tell me ? I can't found
> any reason why the bytes would get mangled...
>
> The script follows:
>

[...]
I note that you are dumping words rather than bytes. Is it possible that
the last byte isn't actually a part of the file, that
endianness makes the last byte look like the penultimate byte, and that
what you are seeing is simply noise?

If not then it should probably be looked into ...

regards
Steve
 
Reply With Quote
 
Juho Saarikko
Guest
Posts: n/a
 
      05-25-2004
On Tue, 25 May 2004 18:54:44 -0400, Steve Holden wrote:

> I note that you are dumping words rather than bytes. Is it possible that
> the last byte isn't actually a part of the file, that
> endianness makes the last byte look like the penultimate byte, and that
> what you are seeing is simply noise?


Well, ImageMagick complains that the image contains errors (altought
Eye of Gnome shows it with no artifacts), so it's likely to be part of the
file itself.

I get both

"display: Premature end of JPEG file"

and

"display: Invalid JPEG file structure: two SOI markers"

errors. The later error prevent ImageMagick's display-command from
displaying the image (but not Eye of Gnome).

> If not then it should probably be looked into ...


Looked, looked, but where to start ? The bug could be anywhere from my
script to binascii module to the nntp module to the string.join -function.

 
Reply With Quote
 
Tim Roberts
Guest
Posts: n/a
 
      05-26-2004
Juho Saarikko <(E-Mail Removed)> wrote:

>I made a Python script which takes Usenet message bodies from a database,
>decodes uuencoded contents and inserts them as Large Object into a
>PostGreSQL database. However, it appears that the to last few bytes
>of uudecoded data are always mangled. Take a look of this hexdump output:
>
>Originals (decoded with Pan, each line is from a different file):
>000c2c0 e1bf 00ff 2541 a9e4 a724 d9ff
>0011a10 ff54 00d9
>00093e0 fb4f a80d ffd9 c200 ffef 00d9
>
>Decoded by the script:
>000c2c0 e1bf 00ff 2541 a9e4 a724 d0ff
>0011a10 ff54 00d8
>00093e0 fb4f a80d ffd9 c200 ffef 00d8
>
>As you can see, one of the last two bytes gets altered in all cases.


As others have pointed out, it's really the last byte that is getting
altered.

> for k in range(n+1, message.ntuples):
># print "Decodind row " + str(k)
> s = message.getvalue(k, 0)
> if s[:3] == "end":
> n = k + 1
> break
> try:
> body.append(binascii.a2b_uu(s))
> except:
> try:
> bytes = (((ord(s[0])-32) & 63) * 4 + 3) / 3
> body.append(binascii.a2b_uu(s[:bytes]))
> except:
> print "Broken attachment in message " + str(id)
> conn.query("ROLLBACK")
> return


Your computation of the number of bytes in the uuencoded string will come
up one short: you're not accounting for the length byte. That will have
exactly the effect you describe. You lose the last encoded character,
which means you'll miss the last 6 bits of the file. Change it to this:

bytes = (((ord(s[0])-32) & 63) * 4 + 3) / 3 + 1

However, you should not need to wrap the first binascii.a2b_uu call with
try/except at all. What is happening that causes the error in the first
place? I suspect if you fix the root cause, you could eliminate the except
clause altogether.
--
- Tim Roberts, http://www.velocityreviews.com/forums/(E-Mail Removed)
Providenza & Boekelheide, Inc.
 
Reply With Quote
 
 
 
Reply

Thread Tools

Posting Rules
You may not post new threads
You may not post replies
You may not post attachments
You may not edit your posts

BB code is On
Smilies are On
[IMG] code is On
HTML code is Off
Trackbacks are On
Pingbacks are On
Refbacks are Off


Similar Threads
Thread Thread Starter Forum Replies Last Post
uuDecode problem py Python 9 12-10-2005 02:39 AM
How do I create a UUDECODE program in C or c++ Anonieko Ramos Computer Security 0 07-20-2004 11:40 PM
Problem problem problem :( Need Help Mike ASP General 2 05-11-2004 08:36 AM
Is there a Free UUDECODE program out? Anonieko Ramos Computer Security 0 05-08-2004 02:23 PM
Help please : UUDecode problem - complete Java app included barry Java 0 12-18-2003 05:01 PM



Advertisments