2017年4月16日日曜日

分割数指定でファイル分割

ファイルを決まった個数に分割したいことはあるが、分割数を指定して分けるコマンドが見当たらず困ることがある。



例えば25行のファイルを8分割したい場合、split で3行ずつに分割すると 25 = 3×8+1 なので9ファイルになり、4行ずつに分割すると 25 = 4×6+1 なので7ファイルになってしまい、ちょうど8分割にはできず微妙に困る。そんな時、分割数を指定して分けるコマンドを自前で持っていると楽ができるかもしれない。



1.shellで分割

#! /bin/sh
#
# split2.sh - split a text file into a fixed number of parts by line count.
#
# Usage : split2.sh divisor input base
#   divisor  number of output files to create (positive integer)
#   input    file to split
#   base     output name prefix; part N is written to <base>NN (00, 01, ...)
#
# The first (total % divisor) parts receive one extra line, so every input
# line ends up in exactly one part.  One summary line per part is printed
# to stdout.

if [ "$#" != 3 ]
then
    echo "Usage : split2.sh divisor input base"
    exit 1
fi

divisor=$1
input=$2
base=$3

# Only a positive decimal integer is usable: anything else would make the
# divisions below fail (division by zero) or produce nonsense ranges.
case "$divisor" in
    ''|*[!0-9]*)
        echo "divisor must be a positive integer [$divisor]" >&2
        exit 1
        ;;
esac
if [ "$divisor" -lt 1 ]
then
    echo "divisor must be a positive integer [$divisor]" >&2
    exit 1
fi

if [ ! -r "$input" ]
then
    echo "Can't open file[$input]" >&2
    exit 1
fi

total=`wc -l < "$input"`
quotient=`expr $total / $divisor`
toomuch=`expr $total % $divisor`

c_start=0
c_end=0
no=0
script=
nl='
'

# Build one sed script with a "START,END w FILE" command per non-empty part.
# The commands are separated by newlines because the file name of a `w`
# command extends to the end of its line.
while [ "$no" -lt "$divisor" ]
do
    c_start=`expr $c_end + 1`
    if [ "$no" -lt "$toomuch" ]
    then
        # The first $toomuch parts absorb the remainder, one line each.
        c_end=`expr $c_start + $quotient`
    else
        c_end=`expr $c_start + $quotient - 1`
    fi
    file=`printf "%s%02d" "$base" "$no"`
    cnt=`expr $c_end - $c_start + 1`

    if [ "$cnt" -gt 0 ]
    then
        script="${script}${c_start},${c_end} w ${file}${nl}"
    else
        # More parts than lines: still create the (empty) part file.
        : > "$file"
    fi

    printf "Part[%d] Start[%08d] End[%08d] Cnt[%08d]\n" `expr $no + 1` $c_start $c_end $cnt
    no=`expr $no + 1`
done

# The script is passed as a single argument and the input via redirection,
# so no file name is ever re-parsed by a shell.
if [ -n "$script" ]
then
    sed -n -e "$script" < "$input"
fi

2.perlで分割

#! /usr/bin/perl
#
# split2.pl - split a text file into a fixed number of parts by line count.
#
# Usage : split2.pl divisor input base
#   divisor  number of output files to create (positive integer)
#   input    file to split
#   base     output name prefix; part N is written to <base>NN (00, 01, ...)
#
# The first (total % divisor) parts receive one extra line.  A summary line
# is printed to stdout each time a new part is started.

use strict;
use warnings;

if (scalar(@ARGV) != 3) {
  printf(STDERR "Usage : split2.pl divisor input base\n");
  exit(1);
}

my ($divisor, $input, $base) = @ARGV;

# A zero or non-numeric divisor would die on the division below.
if ($divisor !~ /\A[0-9]+\z/ || $divisor < 1) {
  printf(STDERR "divisor must be a positive integer [%s]\n", $divisor);
  exit(1);
}

# First pass: count the lines so the part sizes can be computed.
my $total = 0;
my $in;
if (!open($in, '<', $input)) {
  printf(STDERR "Can't open file[%s]\n", $input);
  exit(1);
}
while (my $line = <$in>) {
  $total++;
}
close($in);

my $quotient = int($total / $divisor);
my $toomuch  = $total % $divisor;

# Open every output up front; lexical handles are kept in an array indexed
# by part number instead of symbolic references named after the files.
my @out;
foreach my $no (0 .. $divisor - 1) {
  my $file = sprintf("%s%02d", $base, $no);
  my $fh;
  if (!open($fh, '>', $file)) {
    printf(STDERR "Can't open file[%s]\n", $file);
    exit(1);
  }
  push(@out, $fh);
}

# Second pass: copy each line to the part whose range contains it.
if (!open($in, '<', $input)) {
  printf(STDERR "Can't open file[%s]\n", $input);
  exit(1);
}

my $c_start = 0;
my $c_end   = 0;
my $part    = 0;
my $cnt     = 0;
while (my $line = <$in>) {
  $cnt++;
  if ($cnt > $c_end) {
    # Crossed into the next part: compute its inclusive line range.
    # The first $toomuch parts take one extra line each.
    $c_start = $c_end + 1;
    $c_end   = $c_start + $quotient - ($part < $toomuch ? 0 : 1);
    printf("Part[%d] Start[%08d] End[%08d] Cnt[%08d]\n",
           $part + 1, $c_start, $c_end, $c_end - $c_start + 1);
    $part++;
  }
  print {$out[$part - 1]} $line;
}
close($in);

# Buffered write errors only surface at close, so check it.
foreach my $no (0 .. $#out) {
  if (!close($out[$no])) {
    printf(STDERR "Can't close file[%s%02d]\n", $base, $no);
    exit(1);
  }
}


3.rubyで分割

#! /usr/bin/ruby
#
# split2.rb - split a text file into a fixed number of parts by line count.
#
# Usage : split2.rb divisor input base
#   divisor  number of output files to create (positive integer)
#   input    file to split
#   base     output name prefix; part N is written to <base>NN (00, 01, ...)
#
# The first (total % divisor) parts receive one extra line.  A summary line
# is printed to stdout each time a new part is started.

if (ARGV.length != 3)
  printf("Usage : split2.rb divisor input base\n")
  exit(1)
end

divisor  = ARGV[0].to_i
input    = ARGV[1]
base     = ARGV[2]

# String#to_i yields 0 for non-numeric text, and 0 would raise
# ZeroDivisionError below, so reject everything below 1 here.
if (divisor < 1)
  STDERR.printf("divisor must be a positive integer [%s]\n", ARGV[0])
  exit(1)
end

# First pass: count the lines so the part sizes can be computed.
total = 0
File.open(input, "r") do |f|
  f.each_line { |_line| total += 1 }
end

quotient = total / divisor
toomuch  = total % divisor

c_start  = 0
c_end    = 0
part     = 0
cnt      = 0
fplist   = []

begin
  # File.open (not Kernel#open) so that a base name starting with "|"
  # is treated as a file name and never run as a command.
  divisor.times do |no|
    fplist << File.open("%s%02d" % [base, no], "w")
  end

  # Second pass: copy each line to the part whose range contains it.
  File.open(input, "r") do |f|
    f.each_line do |line|
      cnt += 1
      if (cnt > c_end)
        # Crossed into the next part: compute its inclusive line range.
        # The first `toomuch` parts take one extra line each.
        c_start = c_end + 1
        c_end   = c_start + quotient - (part < toomuch ? 0 : 1)
        printf("Part[%d] Start[%08d] End[%08d] Cnt[%08d]\n",
               part + 1, c_start, c_end, c_end - c_start + 1)
        part += 1
      end
      fplist[part - 1].write(line)
    end
  end
ensure
  # Close whatever was opened, even when an error interrupts the copy.
  fplist.each { |fp| fp.close }
end

4.pythonで分割

#! /usr/bin/python
"""split2.py - split a text file into a fixed number of parts by line count.

Usage : split2.py divisor input base
  divisor  number of output files to create (positive integer)
  input    file to split
  base     output name prefix; part N is written to <base>NN (00, 01, ...)

The first (total % divisor) parts receive one extra line.  A summary line is
printed to stdout each time a new part is started.  The script runs unchanged
on Python 2 and Python 3.
"""

import sys


def part_sizes(total, divisor):
  """Return the number of lines for each of the `divisor` parts.

  The remainder of total / divisor is spread over the leading parts, one
  extra line each, so the sizes always sum to `total`.
  """
  quotient, toomuch = divmod(total, divisor)
  return [quotient + (1 if no < toomuch else 0) for no in range(divisor)]


def split(divisor, path, base):
  """Copy the lines of `path` into `divisor` files named <base>NN.

  Files are handled in binary mode so the bytes are copied unchanged
  regardless of the interpreter version or locale encoding.
  Raises ValueError if `divisor` is below 1; I/O errors propagate.
  """
  if divisor < 1:
    raise ValueError("divisor must be a positive integer")

  # First pass: count the lines so the part sizes can be computed.
  total = 0
  with open(path, "rb") as src:
    for _ in src:
      total += 1
  sizes = part_sizes(total, divisor)

  fplist = []
  try:
    for no in range(divisor):
      fplist.append(open("%s%02d" % (base, no), "wb"))

    # Second pass: copy each line to the part whose range contains it.
    c_start = 0
    c_end = 0
    part = 0
    cnt = 0
    with open(path, "rb") as src:
      for line in src:
        cnt += 1
        if cnt > c_end:
          # Crossed into the next part: compute its inclusive line range.
          c_start = c_end + 1
          c_end = c_start + sizes[part] - 1
          print("Part[%d] Start[%08d] End[%08d] Cnt[%08d]"
                % (part + 1, c_start, c_end, c_end - c_start + 1))
          part += 1
        fplist[part - 1].write(line)
  finally:
    # Close whatever was opened, even when an error interrupts the copy.
    for out in fplist:
      out.close()


def main(argv):
  """Parse the command line, run the split and return the exit status."""
  if len(argv) != 4:
    print("Usage : split2.py divisor input base")
    return 1

  try:
    divisor = int(argv[1])
  except ValueError:
    divisor = 0
  if divisor < 1:
    sys.stderr.write("divisor must be a positive integer [%s]\n" % argv[1])
    return 1

  try:
    split(divisor, argv[2], argv[3])
  except (IOError, OSError) as err:
    sys.stderr.write("%s\n" % err)
    return 1
  return 0


if __name__ == "__main__":
  sys.exit(main(sys.argv))

5.lispで分割

#! /usr/bin/clisp

;;; split.lisp - split a text file into a fixed number of parts by line count.
;;;
;;; Usage: split.lisp divisor input base
;;;   divisor  number of output files to create (positive integer)
;;;   input    file to split
;;;   base     output name prefix; part N is written to <base>NN (00, 01, ...)
;;;
;;; The first (mod total divisor) parts receive one extra line.  A summary
;;; line is printed to standard output each time a new part is started.
;;; Relies on the *args* variable and the exit function being available,
;;; as the shebang's clisp provides them.

(if (not (equal (length *args*) 3))
    (progn
      (format t "Usage split.lisp divisor input base")
      (exit 1)))

(defun count-lines (path)
  "Return the number of lines in the file named PATH."
  (with-open-file (in path :direction :input)
    (loop while (read-line in nil) count t)))

(defun split-file (divisor input base)
  "Copy the lines of INPUT into DIVISOR files named <BASE>NN."
  (let* ((total    (count-lines input))
         (quotient (floor total divisor))
         (toomuch  (mod total divisor))
         ;; A vector gives constant-time access to the current part's stream.
         (outs     (make-array divisor :initial-element nil))
         (c-start  0)
         (c-end    0)
         (part     0)
         (cnt      0))
    (unwind-protect
         (progn
           (dotimes (no divisor)
             (setf (aref outs no)
                   (open (format nil "~A~2,'0D" base no)
                         :direction :output
                         ;; Overwrite old parts explicitly instead of
                         ;; depending on the implementation default.
                         :if-exists :supersede)))
           (with-open-file (in input :direction :input)
             (loop
               (multiple-value-bind (line missing-newline-p)
                   (read-line in nil)
                 ;; Leave the loop at end of file; the cleanup form below
                 ;; still closes every output stream.
                 (unless line (return))
                 (incf cnt)
                 (when (> cnt c-end)
                   ;; Crossed into the next part: compute its inclusive
                   ;; range.  The first TOOMUCH parts take one extra line.
                   (setq c-start (+ c-end 1))
                   (setq c-end (if (< part toomuch)
                                   (+ c-start quotient)
                                   (+ c-start quotient -1)))
                   (incf part)
                   (format t "Part[~D] Start[~8,'0D] End[~8,'0D] Cnt[~8,'0D]~%"
                           part c-start c-end (+ (- c-end c-start) 1)))
                 (let ((out (aref outs (- part 1))))
                   (write-string line out)
                   ;; Do not invent a newline the input did not have.
                   (unless missing-newline-p
                     (terpri out)))))))
      ;; Cleanup: close every output stream that was actually opened.
      (loop for out across outs
            when out do (close out)))))

(let ((divisor (parse-integer (car *args*) :junk-allowed t)))
  ;; A missing or non-positive divisor would fail in the division above.
  (when (or (null divisor) (< divisor 1))
    (format t "divisor must be a positive integer [~A]~%" (car *args*))
    (exit 1))
  (split-file divisor (cadr *args*) (caddr *args*)))

6.javaで分割

import java.io.*;
import java.util.*;

class split2 {
  void split(int divisor, String input, String base) throws Exception {
    BufferedReader br = null;
    List<PrintWriter>fplist = null;
    String line = "";
    int total = 0;
    int c_start = 0;
    int c_end = 0;
    int part = 0;
    int quotient = 0;
    int toomuch = 0;
    int cnt =0;

    br = new BufferedReader(new FileReader(input));
    while((line=br.readLine())!=null) {
      total++;
    }
    br.close();

    fplist = new ArrayList<PrintWriter>();
    for(int no=0;no<divisor;no++) {
      String file = String.format("%s%02d",base,no);
      PrintWriter pw = new PrintWriter(
                           new BufferedWriter(new FileWriter(file)));
      fplist.add(pw);
    }

    quotient = total / divisor;
    toomuch  = total % divisor;

    cnt = 0;
    br = new BufferedReader(new FileReader(input));
    while((line=br.readLine())!=null) {
      cnt++;
      if (cnt > c_end) {
        if (part < toomuch) {
          c_start = c_end + 1;
          c_end   = c_start + quotient;
        } 
        else {
          c_start = c_end + 1;
          c_end   = c_start + quotient - 1;
        }
        System.out.printf("Part[%d] Start[%08d] End[%08d] Cnt[%08d]\n",
                          part+1,c_start,c_end,c_end-c_start+1);
        part++;
      }
      fplist.get(part-1).printf("%s\n",line);
    }
    br.close();

    for(int no=0;no<divisor;no++) {
      fplist.get(no).close();
    }
  } 
  
  public static void main(String args[]) throws Exception {
    if (args.length != 3) {
      System.out.printf("Usage : split2 divisor input base\n");
      System.exit(1);
    }

    int    divisor = Integer.parseInt(args[0]);
    String input   = args[1];
    String base    = args[2];

    split2 obj = new split2();
    obj.split(divisor,input,base);
  }
}

7.C言語で分割

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Split the text file `input` into `divisor` files named <base>00,
 * <base>01, ... by line count.  The first (total % divisor) parts receive
 * one extra line.  A summary line is printed to stdout each time a new
 * part is started.
 *
 * Lines of any length are handled: fgets() may deliver a long line in
 * several chunks, and only a chunk ending in '\n' (or the end of the file)
 * completes a line.
 *
 * Returns 0 on success and 1 on any error, after printing a message to
 * stderr.  All memory and file handles are released on every path.
 */
int
split(int divisor, char *input, char *base)
{
    FILE **fplist   = NULL;
    FILE  *fp       = NULL;
    char  *file     = NULL;
    size_t filesz   = 0;
    size_t len      = 0;
    int    c_start  = 0;
    int    c_end    = 0;
    int    quotient = 0;
    int    toomuch  = 0;
    int    total    = 0;
    int    part     = 0;
    int    cnt      = 0;
    int    no       = 0;
    int    midline  = 0;   /* non-zero while inside a line longer than buf */
    int    rc       = 1;   /* pessimistic; set to 0 once everything worked */
    char   buf[2048];

    /* A divisor below 1 would divide by zero or allocate nothing. */
    if (divisor < 1) {
        fprintf(stderr,"divisor must be a positive number [%d]\n",divisor);
        return(1);
    }

    fplist = calloc((size_t)divisor, sizeof(FILE *));
    if (!fplist) {
        fprintf(stderr,"calloc error!\n");
        goto done;
    }

    /* Room for the prefix, the widest possible int and the terminator. */
    filesz = strlen(base) + 16;
    file = malloc(filesz);
    if (!file) {
        fprintf(stderr,"malloc error!\n");
        goto done;
    }

    /* First pass: count the lines so the part sizes can be computed. */
    fp = fopen(input,"r");
    if (!fp) {
        fprintf(stderr,"Can't open file[%s]\n",input);
        goto done;
    }
    while(fgets(buf, sizeof(buf), fp)) {
        len = strlen(buf);
        if (len > 0 && buf[len-1] == '\n') {
            total++;
            midline = 0;
        }
        else {
            midline = 1;
        }
    }
    if (midline) {
        total++;            /* final line without a trailing newline */
    }
    if (ferror(fp)) {
        fprintf(stderr,"Can't read file[%s]\n",input);
        goto done;
    }
    fclose(fp);
    fp = NULL;

    quotient = total / divisor;
    toomuch  = total % divisor;

    for(no=0;no<divisor;no++) {
        snprintf(file,filesz,"%s%02d",base,no);
        fplist[no] = fopen(file,"w");
        if (!fplist[no]) {
            fprintf(stderr,"Can't open file[%s]\n",file);
            goto done;
        }
    }

    /* Second pass: copy each line to the part whose range contains it. */
    fp = fopen(input,"r");
    if (!fp) {
        fprintf(stderr,"Can't open file[%s]\n",input);
        goto done;
    }
    midline = 0;
    while(fgets(buf, sizeof(buf), fp)) {
        if (!midline) {
            /* This chunk starts a new line. */
            cnt++;
            if (cnt > c_end) {
                /* Crossed into the next part: compute its inclusive range.
                   The first `toomuch` parts take one extra line each. */
                c_start = c_end + 1;
                c_end   = c_start + quotient - ((part < toomuch) ? 0 : 1);
                printf("Part[%d] Start[%08d] End[%08d] Cnt[%08d]\n",
                       part+1, c_start, c_end, c_end-c_start+1);
                part++;
            }
        }
        len = strlen(buf);
        midline = !(len > 0 && buf[len-1] == '\n');
        if (fputs(buf, fplist[part-1]) == EOF) {
            fprintf(stderr,"part [%d] write error\n",part);
            goto done;
        }
    }
    if (ferror(fp)) {
        fprintf(stderr,"Can't read file[%s]\n",input);
        goto done;
    }

    rc = 0;

done:
    if (fp) {
        fclose(fp);
    }
    if (fplist) {
        for(no=0;no<divisor;no++) {
            /* Buffered write errors surface at close, so check it. */
            if (fplist[no] && fclose(fplist[no])) {
                fprintf(stderr,"Can't close file[%s%02d]\n",base,no);
                rc = 1;
            }
        }
        free(fplist);
    }
    free(file);

    return(rc);
}

/*
 * Command-line entry point: split2 divisor input base
 *
 * Validates that the divisor is a positive decimal number, then hands the
 * arguments to split().  The exit status is 0 on success and 1 on a usage
 * error or when split() reports a failure.
 */
int
main(int argc, char *argv[]) 
{
    long  value = 0;
    char *p     = NULL;

    if (argc != 4) {
        fprintf(stderr,"Usage : split2 divisor input base\n");
        exit(1);
    }

    /* An empty string has no digits to reject in the loop below, so it
       must be refused explicitly. */
    if (argv[1][0] == '\0') {
        fprintf(stderr,"not a number! [%s]\n",argv[1]);
        exit(1);
    }
    for(p=argv[1];(*p)!='\0';p++) {
        /* The cast keeps isdigit() defined for bytes above 0x7f. */
        if (!isdigit((unsigned char)*p)) {
            fprintf(stderr,"not a number! [%s]\n",argv[1]);
            exit(1);
        }
    }

    /* Reject zero and anything that does not fit in an int. */
    value = strtol(argv[1], NULL, 10);
    if (value < 1 || (long)(int)value != value) {
        fprintf(stderr,"divisor out of range [%s]\n",argv[1]);
        exit(1);
    }

    /* The arguments are passed straight through: copying them into
       fixed-size buffers would overflow on long path names. */
    return(split((int)value, argv[2], argv[3]));
}

8.速度比較

100万行のファイルを8分割したときの、各実装の出力と実行時間(real / user / sys)。

=== sh ===
Part[1] Start[00000001] End[00125000] Cnt[00125000]
Part[2] Start[00125001] End[00250000] Cnt[00125000]
Part[3] Start[00250001] End[00375000] Cnt[00125000]
Part[4] Start[00375001] End[00500000] Cnt[00125000]
Part[5] Start[00500001] End[00625000] Cnt[00125000]
Part[6] Start[00625001] End[00750000] Cnt[00125000]
Part[7] Start[00750001] End[00875000] Cnt[00125000]
Part[8] Start[00875001] End[01000000] Cnt[00125000]

real    0m2.696s
user    0m0.460s
sys     0m2.140s

=== perl ===
Part[1] Start[00000001] End[00125000] Cnt[00125000]
Part[2] Start[00125001] End[00250000] Cnt[00125000]
Part[3] Start[00250001] End[00375000] Cnt[00125000]
Part[4] Start[00375001] End[00500000] Cnt[00125000]
Part[5] Start[00500001] End[00625000] Cnt[00125000]
Part[6] Start[00625001] End[00750000] Cnt[00125000]
Part[7] Start[00750001] End[00875000] Cnt[00125000]
Part[8] Start[00875001] End[01000000] Cnt[00125000]

real    0m2.122s
user    0m1.736s
sys     0m0.216s

=== ruby ===
Part[1] Start[00000001] End[00125000] Cnt[00125000]
Part[2] Start[00125001] End[00250000] Cnt[00125000]
Part[3] Start[00250001] End[00375000] Cnt[00125000]
Part[4] Start[00375001] End[00500000] Cnt[00125000]
Part[5] Start[00500001] End[00625000] Cnt[00125000]
Part[6] Start[00625001] End[00750000] Cnt[00125000]
Part[7] Start[00750001] End[00875000] Cnt[00125000]
Part[8] Start[00875001] End[01000000] Cnt[00125000]

real    0m2.510s
user    0m2.048s
sys     0m0.256s

=== python ===
Part[1] Start[00000001] End[00125000] Cnt[00125000]
Part[2] Start[00125001] End[00250000] Cnt[00125000]
Part[3] Start[00250001] End[00375000] Cnt[00125000]
Part[4] Start[00375001] End[00500000] Cnt[00125000]
Part[5] Start[00500001] End[00625000] Cnt[00125000]
Part[6] Start[00625001] End[00750000] Cnt[00125000]
Part[7] Start[00750001] End[00875000] Cnt[00125000]
Part[8] Start[00875001] End[01000000] Cnt[00125000]

real    0m1.199s
user    0m0.832s
sys     0m0.228s

=== lisp ===
Part[1] Start[00000001] End[00125000] Cnt[00125000]
Part[2] Start[00125001] End[00250000] Cnt[00125000]
Part[3] Start[00250001] End[00375000] Cnt[00125000]
Part[4] Start[00375001] End[00500000] Cnt[00125000]
Part[5] Start[00500001] End[00625000] Cnt[00125000]
Part[6] Start[00625001] End[00750000] Cnt[00125000]
Part[7] Start[00750001] End[00875000] Cnt[00125000]
Part[8] Start[00875001] End[01000000] Cnt[00125000]

real    0m24.201s
user    0m21.632s
sys     0m2.012s

=== java ===
Part[1] Start[00000001] End[00125000] Cnt[00125000]
Part[2] Start[00125001] End[00250000] Cnt[00125000]
Part[3] Start[00250001] End[00375000] Cnt[00125000]
Part[4] Start[00375001] End[00500000] Cnt[00125000]
Part[5] Start[00500001] End[00625000] Cnt[00125000]
Part[6] Start[00625001] End[00750000] Cnt[00125000]
Part[7] Start[00750001] End[00875000] Cnt[00125000]
Part[8] Start[00875001] End[01000000] Cnt[00125000]

real    0m2.631s
user    0m2.936s
sys     0m0.468s

=== c ===
Part[1] Start[00000001] End[00125000] Cnt[00125000]
Part[2] Start[00125001] End[00250000] Cnt[00125000]
Part[3] Start[00250001] End[00375000] Cnt[00125000]
Part[4] Start[00375001] End[00500000] Cnt[00125000]
Part[5] Start[00500001] End[00625000] Cnt[00125000]
Part[6] Start[00625001] End[00750000] Cnt[00125000]
Part[7] Start[00750001] End[00875000] Cnt[00125000]
Part[8] Start[00875001] End[01000000] Cnt[00125000]

real    0m0.484s
user    0m0.236s
sys     0m0.224s

0 件のコメント:

コメントを投稿