Rainer Weikusat <> writes:
[...]
>>> If you want an opinion on this: If a single line of input is too large
>>> to be kept in memory, Perl is decidedly the wrong choice for solving
>>> this problem.
>>
>> I strongly disagree. IMHO a file with lines that long simply isn't
>> meaningfully a 'text file' any more, and so needs to be handled like a
>> binary file: read in blocks, and remember byte positions. Perl is
>> perfectly capable of handling binary data.
>
> As opposed to something sensible such as memory-mapping the input
> file (or that part of it which fits into the process address space),
> the overhead is going to be grotesque
For sake of completeness and because I felt like doing it: Here's a
seek-based variant based on reading data in 'blocks' (of an arbitrary
size > 0) which actually works (according to my limited testing, still
without error handling).
----------------------------
#!/usr/bin/perl
#
use constant BLOCK => 4096;
sub read_block
{
my ($block, $rc);
$rc = sysread($_[0], $block, $_[1] // BLOCK);
$rc // die("sysread: $!");
$_[1] && $rc != $_[1] && die("short read");
return $rc ? $block : undef;
}
sub get_ids
{
my ($in, $ids) = @_;
my ($block, $id, $want, $bpos, $sbpos, $fpos);
$want = "\t";
while ($block = read_block($in)) {
$sbpos = $bpos = 0;
{
$bpos = index($block, $want, $sbpos);
if ($want eq "\t") {
if ($bpos != -1) {
$id .= substr($block, $sbpos, $bpos - $sbpos);
push(@$ids, [$id, $fpos + ++$bpos]);
$id = '';
$want = "\n";
$sbpos = $bpos;
redo if $sbpos < length($block);
} else {
$id .= substr($block, $sbpos);
}
last;
}
if ($want eq "\n") {
last if $bpos == -1;
push(@{$ids->[$#$ids]}, $fpos + $bpos);
$want = "\t";
$sbpos = $bpos + 1;
redo if $sbpos < length($block);
}
}
$fpos += length($block);
}
}
sub print_id_data
{
my ($fh, $id) = @_;
my ($blocks, $len);
seek($fh, $id->[1], 0);
$len = $id->[2] - $id->[1];
$blocks = int($len / BLOCK);
print(read_block($fh, BLOCK)) while ($blocks--);
$len %= BLOCK;
print(read_block($fh, $len)) if $len;
}
{
my ($fh, @ids);
open($fh, '<', $ARGV[0]) // die("open: $ARGV[0]: $!");
get_ids($fh, \@ids);
print_id_data($fh, $_) for sort { $a->[0] cmp $b->[0] } @ids;
}
|