/* sort - sort lines of text (with all kinds of options).
Copyright (C) 1988, 1991-2011 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Written December 1988 by Mike Haertel.
The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
or (US mail) as Mike Haertel c/o Free Software Foundation.
Ørn E. Hansen added NLS support in 1997. */
#include <config.h>
#include <getopt.h>
#include <pthread.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <signal.h>
#include "system.h"
#include "argmatch.h"
#include "error.h"
#include "fadvise.h"
#include "filevercmp.h"
#include "hard-locale.h"
#include "hash.h"
#include "heap.h"
#include "ignore-value.h"
#include "md5.h"
#include "mbswidth.h"
#include "nproc.h"
#include "physmem.h"
#include "posixver.h"
#include "quote.h"
#include "quotearg.h"
#include "randread.h"
#include "readtokens0.h"
#include "stdio--.h"
#include "stdlib--.h"
#include "strnumcmp.h"
#include "xmemcoll.h"
#include "xnanosleep.h"
#include "xstrtol.h"
#if HAVE_SYS_RESOURCE_H
# include <sys/resource.h>
#endif
/* Fallback used when RLIMIT_DATA is not defined: declare a minimal
   'struct rlimit' with only rlim_cur, and make every getrlimit call
   expand to -1. */
#ifndef RLIMIT_DATA
struct rlimit { size_t rlim_cur; };
# define getrlimit(Resource, Rlp) (-1)
#endif
/* The official name of this program (e.g., no `g' prefix). */
#define PROGRAM_NAME "sort"
/* Comma-separated list of the authors, each wrapped in proper_name. */
#define AUTHORS \
proper_name ("Mike Haertel"), \
proper_name ("Paul Eggert")
/* <langinfo.h> is pulled in only when HAVE_LANGINFO_CODESET is set. */
#if HAVE_LANGINFO_CODESET
# include <langinfo.h>
#endif
/* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
present. When it is missing, the definitions below stand in for the
pieces that go with it. */
#ifndef SA_NOCLDSTOP
# define SA_NOCLDSTOP 0
/* No sigprocmask. Always 'return' zero. */
# define sigprocmask(How, Set, Oset) (0)
# define sigset_t int
/* Without siginterrupt, calls to it expand to nothing. */
# if ! HAVE_SIGINTERRUPT
# define siginterrupt(sig, flag) /* empty */
# endif
#endif
/* If OPEN_MAX is not defined, fall back to NR_OPEN when that exists. */
#if !defined OPEN_MAX && defined NR_OPEN
# define OPEN_MAX NR_OPEN
#endif
/* If neither macro is available, use 20. */
#if !defined OPEN_MAX
# define OPEN_MAX 20
#endif
/* One more than the largest unsigned char value. */
#define UCHAR_LIM (UCHAR_MAX + 1)
/* long_double is 'long double' when HAVE_C99_STRTOLD is set; otherwise
   it is plain 'double' and strtold is redirected to strtod. */
#if HAVE_C99_STRTOLD
# define long_double long double
#else
# define long_double double
# undef strtold
# define strtold strtod
#endif
/* Default temporary directory, unless DEFAULT_TMPDIR was already defined. */
#ifndef DEFAULT_TMPDIR
# define DEFAULT_TMPDIR "/tmp"
#endif
/* Upper bound on the number of lines merged each time a NODE is taken
   off the merge queue.  TOTAL is the number of lines the node is
   responsible for merging, and LEVEL is the node's level in the
   binary merge tree. */
#define MAX_MERGE(total, level) (1 + ((total) >> (((level) + 1) * 2)))
/* Heuristic value for the number of lines for which it is worth
creating a subthread, during an internal merge sort, on a machine
that has processors galore. Currently this number is just a guess.
This value must be at least 4. We don't know of any machine where
this number has any practical effect. */
enum { SUBTHREAD_LINES_HEURISTIC = 4 };
/* The number of threads after which there are
diminishing performance gains. */
enum { DEFAULT_MAX_THREADS = 8 };
/* Exit statuses for unsuccessful runs. */
enum
{
  /* Every other irregular exit: POSIX requires a status code
     greater than 1 for these. */
  SORT_FAILURE = 2,

  /* Invoked with -c and the input was not properly sorted: POSIX
     requires status 1 here. */
  SORT_OUT_OF_ORDER = 1
};
/* Limits on how often a failed fork is retried.  Each retry doubles
   in duration. */
enum
{
  /* Tries for forking a decompression process.  If no decompression
     process can be forked, sorting cannot proceed, so this number
     should be big. */
  MAX_FORK_TRIES_DECOMPRESS = 9,

  /* Tries for forking a compression process.  Compressing temp files
     is not _needed_; it only reduces disk access, so this number can
     be small. */
  MAX_FORK_TRIES_COMPRESS = 4
};
/* Distinguished levels of the merge tree. */
enum
{
  /* The end-of-merge node sits one level above the root. */
  MERGE_END = 0,

  /* The root node of the merge tree. */
  MERGE_ROOT = MERGE_END + 1
};
/* The representation of the decimal point in the current locale. */
static int decimal_point;
/* Thousands separator; if -1, then there isn't one. */
static int thousands_sep;
/* Nonzero if the corresponding locales are hard.
   hard_LC_TIME exists only when HAVE_NL_LANGINFO is set. */
static bool hard_LC_COLLATE;
#if HAVE_NL_LANGINFO
static bool hard_LC_TIME;
#endif
/* Yield 1 if X compares unequal to zero, 0 otherwise. */
#define NONZERO(x) ((x) != 0)
/* The kind of blanks for '-b' to skip in various options. */
enum blanktype { bl_start, bl_end, bl_both };
/* The character marking end of line. Default to \n. */
static char eolchar = '\n';
/* One input line. Lines are held in core as counted strings: TEXT
   points at the bytes and LENGTH counts them. */
struct line
{
char *text; /* Text of the line. */
size_t length; /* Length including final newline. */
char *keybeg; /* Start of first key. */
char *keylim; /* Limit of first key. */
};
/* Input buffers. */
struct buffer
{
  /* Dynamically allocated buffer, partitioned into 3 regions:
     - input data;
     - unused area;
     - an array of lines, in reverse order. */
  char *buf;
  size_t used;        /* Number of bytes used for input data. */
  size_t nlines;      /* Number of lines in the line array. */
  size_t alloc;       /* Number of bytes allocated. */
  size_t left;        /* Number of bytes left from previous reads. */
  size_t line_bytes;  /* Number of bytes to reserve for each line. */
  bool eof;           /* An EOF has been read. */
};
/* Sort key. */
struct keyfield
{
  size_t sword;              /* Zero-origin 'word' to start at. */
  size_t schar;              /* Additional characters to skip. */
  size_t eword;              /* Zero-origin last 'word' of key. */
  size_t echar;              /* Additional characters in field. */
  bool const *ignore;        /* Boolean array of characters to ignore. */
  char const *translate;     /* Translation applied to characters. */
  bool skipsblanks;          /* Skip leading blanks when finding start. */
  bool skipeblanks;          /* Skip leading blanks when finding end. */
  bool numeric;              /* Flag for numeric comparison.  Handle
                                strings of digits with optional decimal
                                point, but no exponential notation. */
  bool random;               /* Sort by random hash of key. */
  bool general_numeric;      /* Flag for general, numeric comparison.
                                Handle numbers in exponential notation. */
  bool human_numeric;        /* Flag for sorting by human readable
                                units with either SI xor IEC prefixes. */
  bool month;                /* Flag for comparison by month name. */
  bool reverse;              /* Reverse the sense of comparison. */
  bool version;              /* Sort by version number. */
  bool obsolete_used;        /* Obsolescent key option format is used. */
  struct keyfield *next;     /* Next keyfield to try. */
};
/* A month name paired with an integer value. */
struct month
{
char const *name; /* Name of the month. */
int val; /* Integer associated with NAME. */
};
/* Binary merge tree node. */
struct merge_node
{
struct line *lo; /* Lines to merge from LO child node. */
struct line *hi; /* Lines to merge from HI child node. */
struct line *end_lo; /* End of available lines from LO. */
struct line *end_hi; /* End of available lines from HI. */
struct line **dest; /* Pointer to destination of merge. */
size_t nlo; /* Total lines remaining from LO. */
size_t nhi; /* Total lines remaining from HI. */
struct merge_node *parent; /* Parent node. */
struct merge_node *lo_child; /* LO child node. */
struct merge_node *hi_child; /* HI child node. */
unsigned int level; /* Level in merge tree. */
bool queued; /* Node is already in heap. */
pthread_mutex_t lock; /* Lock for node operations. */
};
/* Priority queue of merge nodes, together with the mutex and
   condition variable that accompany it. */
struct merge_node_queue
{
struct heap *priority_queue; /* Priority queue of merge tree nodes. */
pthread_mutex_t mutex; /* Lock for queue operations. */
pthread_cond_t cond; /* Conditional wait for empty queue to populate
when popping. */
};
/* FIXME: None of these tables work with multibyte character sets.
Also, there are many other bugs when handling multibyte characters.
One way to fix this is to rewrite `sort' to use wide characters
internally, but doing this with good performance is a bit
tricky. */
/* Each table below has UCHAR_LIM entries. */
/* Table of blanks. */
static bool blanks[UCHAR_LIM];
/* Table of non-printing characters. */
static bool nonprinting[UCHAR_LIM];
/* Table of non-dictionary characters (not letters, digits, or blanks). */
static bool nondictionary[UCHAR_LIM];
/* Translation table folding lower case to upper. */
static char fold_toupper[UCHAR_LIM];
/* Number of months in a year. */
#define MONTHS_PER_YEAR 12
/* Table mapping month names to integers.
Alphabetic order allows binary search; because the entries are ordered
by name, the integer column is not ascending. */
static struct month monthtab[] =
{
{"APR", 4},
{"AUG", 8},
{"DEC", 12},
{"FEB", 2},
{"JAN", 1},
{"JUL", 7},
{"JUN", 6},
{"MAR", 3},
{"MAY", 5},
{"NOV", 11},
{"OCT", 10},
{"SEP", 9}
};
/* During the merge phase, the number of files to merge at once. */
#define NMERGE_DEFAULT 16
/* Minimum size for a merge or check buffer. */
#define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
/* Minimum sort size; the code might not work with smaller sizes.
   Scales with the current value of nmerge. */
#define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE)
/* The number of bytes needed for a merge or check buffer, which can
function relatively efficiently even if it holds only one line. If
a longer line is seen, this value is increased. It starts as the
larger of MIN_MERGE_BUFFER_SIZE and 256 * 1024. */
static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
/* The approximate maximum number of bytes of main memory to use, as
specified by the user. Zero if the user has not specified a size. */
static size_t sort_size;
/* The guessed size for non-regular files. */
#define INPUT_FILE_SIZE_GUESS (1024 * 1024)
/* Array of directory names in which any temporary files are to be created. */
static char const **temp_dirs;
/* Number of temporary directory names used. */
static size_t temp_dir_count;
/* Number of allocated slots in temp_dirs. */
static size_t temp_dir_alloc;
/* Flag to reverse the order of all comparisons. */
static bool reverse;
/* Flag for stable sort. This turns off the last ditch bytewise
comparison of lines, and instead leaves lines in the same order
they were read if all keys compare equal. */
static bool stable;
/* If TAB has this value, blanks separate fields. CHAR_MAX + 1 lies
outside the range of char, so it cannot equal any char value. */
enum { TAB_DEFAULT = CHAR_MAX + 1 };
/* Tab character separating fields. If TAB_DEFAULT, then fields are
separated by the empty string between a non-blank character and a blank
character. */
static int tab = TAB_DEFAULT;
/* Flag to remove consecutive duplicate lines from the output.
Only the last of a sequence of equal lines will be output. */
static bool unique;
/* True if any of the input files are the standard input. */
static bool have_read_stdin;
/* List of key field comparisons to be tried. */
static struct keyfield *keylist;
/* Program used to (de)compress temp files. Must accept -d. */
static char const *compress_program;
/* Annotate the output with extra info to aid the user. */
static bool debug;
/* Maximum number of files to merge in one go. If more than this
number are present, temp files will be used. Starts at NMERGE_DEFAULT. */
static unsigned int nmerge = NMERGE_DEFAULT;
/* Issue a diagnostic of the form "MESSAGE: FILE", passing along the
   errno value current on entry, then terminate the program via
   exit (SORT_FAILURE).  A null FILE stands for standard output.
   This function does not return. */
static void die (char const *, char const *) ATTRIBUTE_NORETURN;
static void
die (char const *message, char const *file)
{
  /* Capture errno before anything else can run. */
  int saved_errno = errno;
  char const *target = file;

  if (!target)
    target = _("standard output");

  error (0, saved_errno, "%s: %s", message, target);
  exit (SORT_FAILURE);
}
void
usage (int status)
{
if (status != EXIT_SUCCESS)
fprintf (stderr, _("Try `%s --help' for more information.\n"),
program_name);
else
{
printf (_("\
Usage: %s [OPTION]... [FILE]...\n\
or: %s [OPTION]... --files0-from=F\n\
"),
program_name, program_name);
fputs (_("\
Write sorted concatenation of all FILE(s) to standard output.\n\
\n\
"), stdout);
fputs (_("\
Mandatory arguments to long options are mandatory for short options too.\n\
"), stdout);
fputs (_("\
Ordering options:\n\
\n\
"), stdout);
fputs (_("\
-b, --ignore-leading-blanks ignore leading blanks\n\
-d, --dictionary-order consider only blanks and alphanumeric characters\
\n\
-f, --ignore-case fold lower case to upper case characters\n\
"), stdout);
fputs (_("\
-g, --general-numeric-sort compare according to general numerical value\n\
-i, --ignore-nonprinting consider only printable characters\n\
-M, --month-sort compare (unknown) < `JAN' < ... < `DEC'\n\
"), stdout);
fputs (_("\
-h, --human-numeric-sort compare human readable numbers (e.g., 2K 1G)\n\
"), stdout);
fputs (_("\
-n, --numeric-sort compare according to string numerical value\n\
-R, --random-sort sort by random hash of keys\n\
--random-source=FILE get random bytes from FILE\n\
-r, --reverse reverse the result of comparisons\n\
"), stdout);
fputs (_("\
--sort=WORD sort according to WORD:\n\
general-numeric -g, human-numeric -h, month -M,\
\n\
numeric -n, random -R, version -V\n\
-V, --version-sort natural sort of (version) numbers within text\n\
\n\
"), stdout);
fputs (_("\
Other options:\n\
\n\
"), stdout);
fputs (_("\
--batch-size=NMERGE merge at most NMERGE inputs at once;\n\
for more use temp files\n\
"), stdout);
fputs (_("\
-c, --check, --check=diagnose-first check for sorted input; do not sort\n\
-C, --check=quiet, --check=silent like -c, but do not report first bad line\
\n\
--compress-program=PROG compress temporaries with PROG;\n\
decompress them with PROG -d\n\
"), stdout);
fputs (_("\
--debug annotate the part of the line used to sort,\n\
and warn about questionable usage to stderr\n\
--files0-from=F read input from the files specified by\n\
NUL-terminated names in file F;\n\
If F is - then read names from standard input\n\
"), stdout);
fputs (_("\
-k, --key=POS1[,POS2] start a key at
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。
发表评论