Categories
SysOps

How to perform basic HDFS operations

Let’s perform basic HDFS operations.

List local files.

$ ls -l -h ~/log/
total 488K
-rw-r--r-- 1 hadoop hadoop  22K May 13 20:11 alternatives.log
drwxr-xr-x 2 hadoop hadoop 4.0K May 13 20:11 apt
-rw-r----- 1 hadoop hadoop  33K May 13 20:11 auth.log
-rw-r----- 1 hadoop hadoop    0 May 13 20:11 btmp
-rw-r----- 1 hadoop hadoop  16K May 13 20:11 daemon.log
-rw-r--r-- 1 hadoop hadoop 256K May 13 20:11 dpkg.log
-rw-r--r-- 1 hadoop hadoop  26K May 13 20:11 faillog
-rw-r--r-- 1 hadoop hadoop 229K May 13 20:11 lastlog
-rw-r----- 1 hadoop hadoop  610 May 13 20:11 mail.info
-rw-r----- 1 hadoop hadoop  610 May 13 20:11 mail.log
-rw-r----- 1 hadoop hadoop  230 May 13 20:11 mail.warn
-rw-r----- 1 hadoop hadoop  294 May 13 20:11 messages
-rw-r--r-- 1 hadoop hadoop  42K May 13 20:11 syslog
-rw-r--r-- 1 hadoop hadoop 3.6K May 13 20:11 syslog.1
-rw-r--r-- 1 hadoop hadoop 2.3K May 13 20:11 syslog.2.gz
-rw-r----- 1 hadoop hadoop  39K May 13 20:11 user.log

Create a directory and make parent directories as needed.

$ hdfs dfs -mkdir -p /logs/system/local/

Copy files from the local filesystem to the remote directory without creating temporary files, and overwrite the destination if it exists.

$ hdfs dfs -copyFromLocal -d -f  ~/log/ /logs/system/local/

List uploaded files.

$ hdfs dfs -ls /logs/system/local/
Found 16 items
-rw-r--r--   3 hadoop supergroup      21944 2021-05-13 20:32 /logs/system/local/alternatives.log
drwxr-xr-x   - hadoop supergroup          0 2021-05-13 20:32 /logs/system/local/apt
-rw-r--r--   3 hadoop supergroup      33398 2021-05-13 20:32 /logs/system/local/auth.log
-rw-r--r--   3 hadoop supergroup          0 2021-05-13 20:32 /logs/system/local/btmp
-rw-r--r--   3 hadoop supergroup      15806 2021-05-13 20:32 /logs/system/local/daemon.log
-rw-r--r--   3 hadoop supergroup     261722 2021-05-13 20:32 /logs/system/local/dpkg.log
-rw-r--r--   3 hadoop supergroup      25632 2021-05-13 20:32 /logs/system/local/faillog
-rw-r--r--   3 hadoop supergroup     233892 2021-05-13 20:32 /logs/system/local/lastlog
-rw-r--r--   3 hadoop supergroup        610 2021-05-13 20:32 /logs/system/local/mail.info
-rw-r--r--   3 hadoop supergroup        610 2021-05-13 20:32 /logs/system/local/mail.log
-rw-r--r--   3 hadoop supergroup        230 2021-05-13 20:32 /logs/system/local/mail.warn
-rw-r--r--   3 hadoop supergroup        294 2021-05-13 20:32 /logs/system/local/messages
-rw-r--r--   3 hadoop supergroup      42444 2021-05-13 20:32 /logs/system/local/syslog
-rw-r--r--   3 hadoop supergroup       3664 2021-05-13 20:32 /logs/system/local/syslog.1
-rw-r--r--   3 hadoop supergroup       2301 2021-05-13 20:32 /logs/system/local/syslog.2.gz
-rw-r--r--   3 hadoop supergroup      39320 2021-05-13 20:32 /logs/system/local/user.log

Count the number of directories and files, and display quota and content size information.

$ hdfs dfs -count -v -q  -h /logs/system/local/
       QUOTA       REM_QUOTA     SPACE_QUOTA REM_SPACE_QUOTA    DIR_COUNT   FILE_COUNT       CONTENT_SIZE PATHNAME
        none             inf            none             inf            2           18            719.5 K /logs/system/local

Look for specific files.

$ hdfs dfs -find /logs -iname 'mail.*' -print
/logs/system/local/mail.info
/logs/system/local/mail.log
/logs/system/local/mail.warn

Display file checksum.

$ hdfs dfs -checksum  /logs/system/local/auth.log
/logs/system/local/auth.log     MD5-of-0MD5-of-512CRC32C        000002000000000000000000b08bf0120a571c71e625043bc0cb0a8b

Display file contents.

$ hdfs dfs -cat /logs/system/local/mail.log
May 10 22:35:00 client postfix/postfix-script[242]: warning: symlink leaves directory: /etc/postfix/./makedefs.out
May 10 22:35:00 client postfix/postfix-script[279]: starting the Postfix mail system
May 10 22:35:00 client postfix/master[281]: daemon started -- version 3.4.14, configuration /etc/postfix
May 10 22:53:02 client postfix/postfix-script[229]: warning: symlink leaves directory: /etc/postfix/./makedefs.out
May 10 22:53:02 client postfix/postfix-script[266]: starting the Postfix mail system
May 10 22:53:02 client postfix/master[268]: daemon started -- version 3.4.14, configuration /etc/postfix

Display file contents as plain text including gzipped files.

$ hdfs dfs -text /logs/system/local/syslog.2.gz
May 10 22:34:57 client systemd[1]: Starting Flush Journal to Persistent Storage...
May 10 22:34:57 client systemd-sysusers[51]: Creating group systemd-coredump with gid 999.
May 10 22:34:57 client systemd-sysusers[51]: Creating user systemd-coredump (systemd Core Dumper) with uid 999 and gid 999.
[...]
May 11 23:56:44 client systemd[1]: apt-daily.service: Succeeded.
May 11 23:56:44 client systemd[1]: Started Daily apt download activities.
May 12 00:00:02 client systemd[1]: Starting Rotate log files...
May 12 00:00:02 client systemd[1]: Starting Daily man-db regeneration...

Display the first kilobyte of the file.

$ hdfs dfs -head  /logs/system/local/syslog
May 13 00:00:02 client rsyslogd:  [origin software="rsyslogd" swVersion="8.1901.0" x-pid="77" x-info="https://www.rsyslog.com"] rsyslogd was HUPed
May 13 00:00:03 client systemd[1]: logrotate.service: Succeeded.
May 13 00:00:03 client systemd[1]: Started Rotate log files.
May 13 00:00:03 client systemd[1]: man-db.service: Succeeded.
May 13 00:00:03 client systemd[1]: Started Daily man-db regeneration.
May 13 00:16:01 client CRON[12225]: (root) CMD (   cd / && run-parts --report /etc/cron.hourly)
May 13 01:16:01 client CRON[12229]: (root) CMD (   cd / && run-parts --report /etc/cron.hourly)
May 13 02:16:01 client CRON[12232]: (root) CMD (   cd / && run-parts --report /etc/cron.hourly)
May 13 03:16:01 client CRON[12236]: (root) CMD (   cd / && run-parts --report /etc/cron.hourly)
May 13 04:16:01 client CRON[12240]: (root) CMD (   cd / && run-parts --report /etc/cron.hourly)
May 13 05:16:01 client CRON[12243]: (root) CMD (   cd / && run-parts --report /etc/cron.hourly)

Display the last kilobyte of the file.

$ hdfs dfs -tail  /logs/system/local/syslog
 an error! possibly I die! 
May 13 19:05:29 client vim: *** err 
May 13 19:05:29 client vim: unable to open gpm console, check your /dev filesystem!
May 13 19:05:29 client vim: *** err 
May 13 19:05:29 client vim: Oh, oh, it's an error! possibly I die! 
May 13 19:05:29 client vim: *** err 
May 13 19:05:29 client vim: unable to open gpm console, check your /dev filesystem!
May 13 19:05:29 client vim: *** err 
May 13 19:05:29 client vim: Oh, oh, it's an error! possibly I die! 
May 13 19:05:30 client vim: *** err 
May 13 19:05:30 client vim: unable to open gpm console, check your /dev filesystem!
May 13 19:05:30 client vim: *** err 
May 13 19:05:30 client vim: Oh, oh, it's an error! possibly I die! 
May 13 19:16:01 client CRON[16457]: (root) CMD (   cd / && run-parts --report /etc/cron.hourly)
May 13 20:08:30 client mc: *** err 
May 13 20:08:30 client mc: unable to open gpm console, check your /dev filesystem!
May 13 20:08:30 client mc: *** err 
May 13 20:08:30 client mc: Oh, oh, it's an error! possibly I die! 

Check if a directory exists.

$ hdfs dfs -test -d /logs/system/local/dpkg  
$ echo $?
1

Check if a file exists.

$ hdfs dfs -test -e /logs/system/local/messages  
$ echo $?
0

Display requested file details.

$ hdfs dfs -stat "type:%F perm:%a (%A) user:%u group:%g size:%b replicas:%r mtime:%y (%Y) atime:%x (%X) name:%n "  /logs/system/local/messages
type:regular file perm:644 (rw-r--r--) user:hadoop group:supergroup size:294 replicas:3 mtime:2021-05-13 20:32:34 (1620937954899) atime:2021-05-13 20:32:34 (1620937954872) name:messages

Truncate a file.

$ hdfs dfs -truncate 0 /logs/system/local/auth.log
Truncated /logs/system/local/auth.log to length: 0

Update the access and modification times of a file, or create an empty file if it does not exist.

$ hdfs dfs -touch /logs/system/local/.auth.log.read

Recursively delete files and directories.

$ hdfs dfs -rm -r /logs/system/local/apt/*
Deleted /logs/system/local/apt/eipp.log.xz
Deleted /logs/system/local/apt/history.log
Deleted /logs/system/local/apt/term.log

Delete an empty directory.

$ hdfs dfs -rmdir /logs/system/local/apt

Recursively change replication factor.

$ hdfs dfs -setrep 1 /logs/system/local
Replication 1 set: /logs/system/local/.auth.log.read
Replication 1 set: /logs/system/local/alternatives.log
Replication 1 set: /logs/system/local/auth.log
Replication 1 set: /logs/system/local/btmp
Replication 1 set: /logs/system/local/daemon.log
Replication 1 set: /logs/system/local/dpkg.log
Replication 1 set: /logs/system/local/faillog
Replication 1 set: /logs/system/local/lastlog
Replication 1 set: /logs/system/local/mail.info
Replication 1 set: /logs/system/local/mail.log
Replication 1 set: /logs/system/local/mail.warn
Replication 1 set: /logs/system/local/messages
Replication 1 set: /logs/system/local/syslog
Replication 1 set: /logs/system/local/syslog.1
Replication 1 set: /logs/system/local/syslog.2.gz
Replication 1 set: /logs/system/local/user.log

Merge multiple files and locally store the resulting file.

$ hdfs dfs -getmerge /logs/system/local/mail.* mail
$ cat mail
May 10 22:35:00 client postfix/postfix-script[242]: warning: symlink leaves directory: /etc/postfix/./makedefs.out
May 10 22:35:00 client postfix/postfix-script[279]: starting the Postfix mail system
May 10 22:35:00 client postfix/master[281]: daemon started -- version 3.4.14, configuration /etc/postfix
May 10 22:53:02 client postfix/postfix-script[229]: warning: symlink leaves directory: /etc/postfix/./makedefs.out
May 10 22:53:02 client postfix/postfix-script[266]: starting the Postfix mail system
May 10 22:53:02 client postfix/master[268]: daemon started -- version 3.4.14, configuration /etc/postfix
May 10 22:35:00 client postfix/postfix-script[242]: warning: symlink leaves directory: /etc/postfix/./makedefs.out
May 10 22:35:00 client postfix/postfix-script[279]: starting the Postfix mail system
May 10 22:35:00 client postfix/master[281]: daemon started -- version 3.4.14, configuration /etc/postfix
May 10 22:53:02 client postfix/postfix-script[229]: warning: symlink leaves directory: /etc/postfix/./makedefs.out
May 10 22:53:02 client postfix/postfix-script[266]: starting the Postfix mail system
May 10 22:53:02 client postfix/master[268]: daemon started -- version 3.4.14, configuration /etc/postfix
May 10 22:35:00 client postfix/postfix-script[242]: warning: symlink leaves directory: /etc/postfix/./makedefs.out
May 10 22:53:02 client postfix/postfix-script[229]: warning: symlink leaves directory: /etc/postfix/./makedefs.out

Display free space.

$ hdfs dfs -df -h
Filesystem                          Size     Used  Available  Use%
hdfs://namenode.example.org:9000  58.7 G  935.6 K     48.8 G    0%

Upload the local files again, then display the sizes of files and directories.

$ hdfs dfs -copyFromLocal -d -f  ~/log/ /logs/system/local/
$ hdfs dfs -du -v -h  /logs/system/local/
SIZE     DISK_SPACE_CONSUMED_WITH_ALL_REPLICAS  FULL_PATH_NAME
0        0                                      /logs/system/local/.auth.log.read
21.4 K   64.3 K                                 /logs/system/local/alternatives.log
53.6 K   160.9 K                                /logs/system/local/apt
32.6 K   97.8 K                                 /logs/system/local/auth.log
0        0                                      /logs/system/local/btmp
15.4 K   46.3 K                                 /logs/system/local/daemon.log
255.6 K  766.8 K                                /logs/system/local/dpkg.log
25.0 K   75.1 K                                 /logs/system/local/faillog
228.4 K  685.2 K                                /logs/system/local/lastlog
610      1.8 K                                  /logs/system/local/mail.info
610      1.8 K                                  /logs/system/local/mail.log
230      690                                    /logs/system/local/mail.warn
294      882                                    /logs/system/local/messages
41.4 K   124.3 K                                /logs/system/local/syslog
3.6 K    10.7 K                                 /logs/system/local/syslog.1
2.2 K    6.7 K                                  /logs/system/local/syslog.2.gz
38.4 K   115.2 K                                /logs/system/local/user.log

Copy a file to the local filesystem.

$ hdfs dfs -copyToLocal  /logs/system/local/user.log  ~/user.log

Upload a single file and overwrite the destination.

$ hdfs dfs -put -f ~/log/syslog  /logs/system/local/syslog

Download a single file and overwrite the local destination.

$ hdfs dfs -get -f /logs/system/local/syslog syslog

Move a file or directory.

$ hdfs dfs -mv /logs/system/local /logs/system/localhost

Copy a file or directory.

$ hdfs dfs -cp /logs/system/localhost /logs/backup