I'm trying to implement a custom perl nagios script to check for rogue dhcp servers remotely with nrpe. On the central server when i run:
/usr/local/nagios/libexec/check_nrpe -H 10.9.0.25 -c check_roguedhcp
In my debugging logs i'm seeing this:
Host is asking for command 'check_roguedhcp' to be run...
Running command: sudo /usr/lib64/nagios/plugins/check_roguedhcp.pl
Command completed with return code 1 and output:
Return Code: 1, Output: NRPE: Unable to read output
Locally if i run the script (even as the nrpe user) I get the expected output.
On the local server my /etc/nagios/nrpe.cfg has the following settings:
command[check_roguedhcp]=sudo /usr/lib64/nagios/plugins/check_roguedhcp.pl
command[check_dhcp]=sudo /usr/lib64/nagios/plugins/check_dhcp -v
nrpe_user=nrpe
nrpe_group=nagios
ps aux shows nrpe is running as user nrpe (nrpe is in group nagios)
nrpe 5941 0.0 0.1 52804 2384 ? Ss 08:25 0:00 /usr/sbin/nrpe -c /etc/nagios/nrpe.cfg -d
I've added the command to /etc/sudoers
%nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios64/plugins/check_dhcp, /usr/lib64/nagios/plugins/check_roguedhcp.pl
on my central server that does the nrpe calls, i have the following service groups and configurations:
define servicegroup{
servicegroup_name rogue_dhcp
alias All dhcp monitors
}
define service{
name security-service
servicegroups rogue_dhcp
register 0
max_check_attempts 1
}
Nagios can run any other check_users etc script via nrpe on this server.
Here's the perl script itself, though we know that the file executes locally just fine.
1 #!/usr/bin/perl -w
2 # nagios: -epn
3 # the above makes nagios run the script separately.
4 use POSIX;
5 use lib "/usr/lib64/nagios/plugins";
6 use utils qw(%ERRORS);
7
8 sub fail_usage {
9 if (scalar @_) {
10 print "$0: error: \n";
11 map { print " $_\n"; } @_;
12 }
13 print "$0: Usage: \n";
14 print "$0 [-v [-v [-v]]] [ []] \n";
15 print "$0 [-v [-v [-v]]] [-s] [[-s] [[-s] ]] \n";
16 print " \n";
17 exit 3 ;
18 }
19
20 my $verbose = 0;
21 my %servers=(
22 "x", "10.x.x.x",
23 "x", "10.x.x.x",
24 "x", "10.x.x.x",
25 "x", "10.x.x.x"
26 );
27
28 # examine commandline args
29 while ($ARGV=$ARGV[0]) {
30 my $myarg = $ARGV;
31 if ($ARGV eq '-s') {
32 shift @ARGV;
33 if (!($ARGV = $ARGV[0])) { fail_usage ("$myarg needs an argument"); }
34 if ($ARGV =~ /^-/) { fail_usage ("$myarg must be followed by an argument"); }
35 if (!defined($servers{$ARGV})) { $servers{$ARGV}=1; }
36 }
37 elsif ($ARGV eq '-v' ) { $verbose++; }
38 elsif ($ARGV eq '-h' or $ARGV eq '--help' ) { fail_usage ; }
39 elsif ($ARGV =~ /^-/ ) { fail_usage " invalid option ($ARGV)"; }
40 elsif ($ARGV =~ /^\d+\.\d+\.\d+\.\d+$/)
41 # servers should be ip addresses. I'm not doing detailed checks for this.
42 { if (!defined($servers{$ARGV})) { $servers{$ARGV}=1; } }
43 else { last; }
44 shift @ARGV;
45 }
46 # for some reason I can't test for empty ARGs in the while loop
47 @ARGV = grep {!/^\s*$/} @ARGV;
48 if (scalar @ARGV) { fail_usage "didn't understand arguments: (".join (" ",@ARGV).")"; }
49
50 my $serversn = scalar keys %servers;
51
52 if ($verbose > 2) {
53 print "verbosity=($verbose)\n";
54 print "servers = ($serversn)\n";
55 if ($serversn) { for my $i (keys %servers) { print "server ($i)\n"; } }
56 }
57
58 if (!$serversn) { fail_usage "no servers"; }
59 my $responses=0;
60 my $responders="";
61 my @check_dhcp = qx{/usr/lib64/nagios/plugins/check_dhcp -v};
62 foreach my $value (@check_dhcp) {
63 if ($value =~ /Added offer from server \@ /i){
64 $value =~ m/(\d+\.\d+\.\d+\.\d+)/i;
65 my $host = $1;
66 # we find a server in our list
67 if (defined($servers{$host})) { $responses++; $responders.="$host "; }
68 # we find a rogue DHCP server. Danger Will Robinson!
69 else {
70 print "DHCP:CRITICAL: DHCP service running on $host";
71 exit $ERRORS{'OK'}
72 }
73 }
74 }
75 # we saw all the servers in our list. All is good.
76 if ($responses == $serversn) {
77 print "DHCP:OK: $responses of $serversn Expected Responses to DHCP Broadcast";
78 exit $ERRORS{'OK'};
79 }
80 # we found no DHCP responses.
81 if ($responses == 0) {
82 print "DHCP:OK: no rogue servers detected!!!!#!@#";
83 exit $ERRORS{'OK'}
84 }
85 # we found less DHCP servers than we should have. Oh Nos!
86 $responders =~ s/ $//;
87 print "DHCP:OK: $responses of $serversn Responses to DHCP Broadcast. ($responders) responded. ";
88 exit $ERRORS{'OK'};
Here's what I am seeing (of relevance) when I do an strace of the nrpe process.
955 6950 stat("/usr/lib64/nagios/plugins/check_roguedhcp.pl", {st_mode=S_IFREG|S_ISUID|S_ISGID|0755, st_size=2799, ...}) = 0
956 6950 setresuid(4294967295, 4294967295, 4294967295) = 0
957 6950 setresgid(4294967295, 536347864, 4294967295) = 0
958 6950 setgroups(3, [536347864, 536347137, 536353632]) = 0
959 6950 open("/dev/tty", O_RDWR|O_NOCTTY) = -1 ENXIO (No such device or address)
960 6950 socket(PF_NETLINK, SOCK_RAW, 9) = 3
961 6950 fcntl(3, F_SETFD, FD_CLOEXEC) = 0
962 6950 fcntl(3, F_SETFD, FD_CLOEXEC) = 0
963 6950 ioctl(0, SNDCTL_TMR_TIMEBASE or TCGETS, 0x7fff3de81ac0) = -1 ENOTTY (Inappropriate ioctl for device)
964 6950 ioctl(1, SNDCTL_TMR_TIMEBASE or TCGETS, 0x7fff3de81ac0) = -1 EINVAL (Invalid argument)
965 6950 ioctl(2, SNDCTL_TMR_TIMEBASE or TCGETS, 0x7fff3de81ac0) = -1 ENOTTY (Inappropriate ioctl for device)
966 6950 getcwd("/", 4096) = 2
967 6950 sendto(3, "d\0\0\0c\4\5\0\1\0\0\0\0\0\0\0cwd=\"/\" cmd=\"/us"..., 100, 0, {sa_family=AF_NETLINK, pid=0, groups=00000000}, 12) = 100
968 6950 poll([{fd=3, events=POLLIN}], 1, 500) = 1 ([{fd=3, revents=POLLIN}])
969 6950 recvfrom(3, "$\0\0\0\2\0\0\0\1\0\0\0&\33\0\0\0\0\0\0d\0\0\0c\4\5\0\1\0\0\0"..., 8988, MSG_PEEK|MSG_DONTWAIT, {sa_family=AF_NE TLINK, pid=0, groups=00000000}, [12]) = 36
970 6950 recvfrom(3, "$\0\0\0\2\0\0\0\1\0\0\0&\33\0\0\0\0\0\0d\0\0\0c\4\5\0\1\0\0\0"..., 8988, MSG_DONTWAIT, {sa_family=AF_NETLINK, pi d=0, groups=00000000}, [12]) = 36
971 6950 write(2, "sudo", 4) = 4
972 6950 write(2, ": ", 2) = 2
973 6950 write(2, "sorry, you must have a tty to ru"..., 38) = 38
974 6950 write(2, "\n", 1) = 1
975 6950 setresuid(4294967295, 4294967295, 4294967295) = 0
976 6950 setresgid(4294967295, 4294967295, 4294967295) = 0
977 6950 exit_group(1) = ?
978 6949 <... read resumed> "", 4096) = 0
979 6949 --- SIGCHLD (Child exited) @ 0 (0) ---
980 6949 close(5) = 0
981 6949 wait4(6950, [{WIFEXITED(s) && WEXITSTATUS(s) == 1}], 0, NULL) = 6950
970 6950 recvfrom(3, "$\0\0\0\2\0\0\0\1\0\0\0&\33\0\0\0\0\0\0d\0\0\0c\4\5\0\1\0\0\0"..., 8988, MSG_DONTWAIT, {sa_family=AF_NETLINK, pi d=0, groups=00000000}, [12]) = 36
971 6950 write(2, "sudo", 4) = 4
972 6950 write(2, ": ", 2) = 2
973 6950 write(2, "sorry, you must have a tty to ru"..., 38) = 38
974 6950 write(2, "\n", 1) = 1
975 6950 setresuid(4294967295, 4294967295, 4294967295) = 0
976 6950 setresgid(4294967295, 4294967295, 4294967295) = 0
977 6950 exit_group(1) = ?
This was solved by adding the following to /etc/sudoers
Defaults:nagios !requiretty