Discussion:
[ceph-users] PG problem after reweight (1 PG active+remapped)
Athanasios Panterlis
2018-12-03 13:51:07 UTC
Permalink
Hi all,

I am managing a typical small Ceph cluster that consists of 4 nodes, each one having 7 OSDs (some in an hdd pool, some in an ssd pool).

Starting from a healthy cluster, and following some space issues due to bad PG management by Ceph, I tried reweighting some specific OSDs. Unfortunately, during the rebalancing after the reweight, a specific PG went into the active+remapped state, and I now have a few misplaced objects (3359/6118678 objects misplaced (0.055%)).

The cluster had (for IOPS reasons) the scrub and deep-scrub options disabled. I enabled a simple scrub on all PGs and waited for it to run. It finished completely, without problems as far as I can tell. I would like to avoid a deep-scrub, but if it is going to help I will run it once.

I am thinking of declaring OSD 26 as lost. This way, a new PG copy will be created from OSD 6, correct?
Any other less harmful thoughts on how to fix it?

I attached all the information I could provide. I am also pasting it raw below:

Query for faulty PG:

{
"state": "active+remapped",
"snap_trimq": "[]",
"epoch": 11755,
"up": [
6
],
"acting": [
6,
26
],
"actingbackfill": [
"6",
"26"
],
"info": {
"pgid": "1.11d",
"last_update": "11755'60561210",
"last_complete": "11755'60561210",
"log_tail": "11755'60558123",
"last_user_version": 60561210,
"last_backfill": "MAX",
"purged_snaps": "[1~33,36~22]",
"history": {
"epoch_created": 31,
"last_epoch_started": 11681,
"last_epoch_clean": 11681,
"last_epoch_split": 0,
"same_up_since": 11679,
"same_interval_since": 11680,
"same_primary_since": 11510,
"last_scrub": "449'16483",
"last_scrub_stamp": "2016-09-14 15:25:14.228231",
"last_deep_scrub": "448'16277",
"last_deep_scrub_stamp": "2016-09-13 06:11:45.633007",
"last_clean_scrub_stamp": "2016-09-14 15:25:14.228231"
},
"stats": {
"version": "11755'60561210",
"reported_seq": "50924585",
"reported_epoch": "11755",
"state": "active+remapped",
"last_fresh": "2018-12-03 12:58:03.289251",
"last_change": "2018-11-09 10:54:06.861873",
"last_active": "2018-12-03 12:58:03.289251",
"last_peered": "2018-12-03 12:58:03.289251",
"last_clean": "2018-11-09 10:54:02.622866",
"last_became_active": "0.000000",
"last_became_peered": "0.000000",
"last_unstale": "2018-12-03 12:58:03.289251",
"last_undegraded": "2018-12-03 12:58:03.289251",
"last_fullsized": "2018-12-03 12:58:03.289251",
"mapping_epoch": 11679,
"log_start": "11755'60558123",
"ondisk_log_start": "11755'60558123",
"created": 31,
"last_epoch_clean": 11681,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "449'16483",
"last_scrub_stamp": "2016-09-14 15:25:14.228231",
"last_deep_scrub": "448'16277",
"last_deep_scrub_stamp": "2016-09-13 06:11:45.633007",
"last_clean_scrub_stamp": "2016-09-14 15:25:14.228231",
"log_size": 3087,
"ondisk_log_size": 3087,
"stats_invalid": "0",
"stat_sum": {
"num_bytes": 14031434258,
"num_objects": 3359,
"num_object_clones": 0,
"num_object_copies": 6718,
"num_objects_missing_on_primary": 0,
"num_objects_degraded": 0,
"num_objects_misplaced": 3359,
"num_objects_unfound": 0,
"num_objects_dirty": 3359,
"num_whiteouts": 0,
"num_read": 27359423,
"num_read_kb": 1815932413,
"num_write": 121113356,
"num_write_kb": 2124776643,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 0,
"num_objects_recovered": 65218,
"num_bytes_recovered": 271765903872,
"num_keys_recovered": 0,
"num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0
},
"up": [
6
],
"acting": [
6,
26
],
"blocked_by": [],
"up_primary": 6,
"acting_primary": 6
},
"empty": 0,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 11681,
"hit_set_history": {
"current_last_update": "0'0",
"current_last_stamp": "0.000000",
"current_info": {
"begin": "0.000000",
"end": "0.000000",
"version": "0'0",
"using_gmt": "1"
},
"history": []
}
},
"peer_info": [
{
"peer": "26",
"pgid": "1.11d",
"last_update": "11755'60561210",
"last_complete": "11755'60561210",
"log_tail": "11649'58446601",
"last_user_version": 58449647,
"last_backfill": "MAX",
"purged_snaps": "[1~33,36~22]",
"history": {
"epoch_created": 31,
"last_epoch_started": 11681,
"last_epoch_clean": 11681,
"last_epoch_split": 0,
"same_up_since": 11679,
"same_interval_since": 11680,
"same_primary_since": 11510,
"last_scrub": "449'16483",
"last_scrub_stamp": "2016-09-14 15:25:14.228231",
"last_deep_scrub": "448'16277",
"last_deep_scrub_stamp": "2016-09-13 06:11:45.633007",
"last_clean_scrub_stamp": "2016-09-14 15:25:14.228231"
},
"stats": {
"version": "11678'58449646",
"reported_seq": "48950066",
"reported_epoch": "11678",
"state": "active+clean",
"last_fresh": "2018-11-09 10:54:02.263168",
"last_change": "2018-11-09 08:01:12.116827",
"last_active": "2018-11-09 10:54:02.263168",
"last_peered": "2018-11-09 10:54:02.263168",
"last_clean": "2018-11-09 10:54:02.263168",
"last_became_active": "0.000000",
"last_became_peered": "0.000000",
"last_unstale": "2018-11-09 10:54:02.263168",
"last_undegraded": "2018-11-09 10:54:02.263168",
"last_fullsized": "2018-11-09 10:54:02.263168",
"mapping_epoch": 11679,
"log_start": "11649'58446601",
"ondisk_log_start": "11649'58446601",
"created": 31,
"last_epoch_clean": 11610,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "449'16483",
"last_scrub_stamp": "2016-09-14 15:25:14.228231",
"last_deep_scrub": "448'16277",
"last_deep_scrub_stamp": "2016-09-13 06:11:45.633007",
"last_clean_scrub_stamp": "2016-09-14 15:25:14.228231",
"log_size": 3045,
"ondisk_log_size": 3045,
"stats_invalid": "0",
"stat_sum": {
"num_bytes": 18153595392,
"num_objects": 4344,
"num_object_clones": 0,
"num_object_copies": 8688,
"num_objects_missing_on_primary": 0,
"num_objects_degraded": 0,
"num_objects_misplaced": 0,
"num_objects_unfound": 0,
"num_objects_dirty": 4344,
"num_whiteouts": 0,
"num_read": 26674601,
"num_read_kb": 1767105243,
"num_write": 116892449,
"num_write_kb": 2073693377,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 0,
"num_objects_recovered": 65218,
"num_bytes_recovered": 271765903872,
"num_keys_recovered": 0,
"num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0
},
"up": [
6
],
"acting": [
6,
26
],
"blocked_by": [],
"up_primary": 6,
"acting_primary": 6
},
"empty": 0,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 11681,
"hit_set_history": {
"current_last_update": "0'0",
"current_last_stamp": "0.000000",
"current_info": {
"begin": "0.000000",
"end": "0.000000",
"version": "0'0",
"using_gmt": "1"
},
"history": []
}
}
],
"recovery_state": [
{
"name": "Started\/Primary\/Active",
"enter_time": "2018-11-09 10:54:06.825830",
"might_have_unfound": [],
"recovery_progress": {
"backfill_targets": [],
"waiting_on_backfill": [],
"last_backfill_started": "-1\/0\/\/0",
"backfill_info": {
"begin": "-1\/0\/\/0",
"end": "-1\/0\/\/0",
"objects": []
},
"peer_backfill_info": [],
"backfills_in_flight": [],
"recovering": [],
"pg_backend": {
"pull_from_peer": [],
"pushing": []
}
},
"scrub": {
"scrubber.epoch_start": "0",
"scrubber.active": 0,
"scrubber.waiting_on": 0,
"scrubber.waiting_on_whom": []
}
},
{
"name": "Started",
"enter_time": "2018-11-09 10:54:05.789621"
}
],
"agent_state": {}
}

Ceph status

health HEALTH_WARN
1 pgs stuck unclean
recovery 3359/6118678 objects misplaced (0.055%)
noout,nodeep-scrub flag(s) set
monmap e3: 3 mons at {0=192.168.1.1:6789/0,1=192.168.1.2:6789/0,2=192.168.1.3:6789/0}
election epoch 4882, quorum 0,1,2 0,1,2
osdmap e11755: 27 osds: 27 up, 27 in; 1 remapped pgs
flags noout,nodeep-scrub
pgmap v62734988: 1024 pgs, 2 pools, 10183 GB data, 2557 kobjects
23768 GB used, 48720 GB / 72488 GB avail
3359/6118678 objects misplaced (0.055%)
1023 active+clean
1 active+remapped
client io 141 kB/s rd, 14068 kB/s wr, 925 op/s

2018-12-03 12:58:52.109913 mon.0 [INF] pgmap v62734987: 1024 pgs: 1 active+remapped, 1023 active+clean; 10183 GB data, 23768 GB used, 48720 GB / 72488 GB avail; 8325 kB/s rd, 16182 kB/s wr, 1704 op/s; 3359/6118678 objects misplaced (0.055%)

OSD tree

ID WEIGHT REWEIGHT SIZE USE AVAIL %USE VAR TYPE NAME
-11 37.19995 - 37204G 10257G 26946G 27.57 0.84 root hdd
-12 9.29999 - 9301G 2531G 6769G 27.22 0.83 host hdd-node1
18 4.64999 1.00000 4650G 1226G 3424G 26.37 0.80 osd.18
19 4.64999 1.00000 4650G 1305G 3345G 28.06 0.86 osd.19
-13 9.29999 - 9301G 2665G 6635G 28.66 0.87 host hdd-node2
20 4.64999 1.00000 4650G 1361G 3289G 29.27 0.89 osd.20
21 4.64999 1.00000 4650G 1304G 3346G 28.05 0.86 osd.21
-14 9.29999 - 9301G 2628G 6672G 28.26 0.86 host hdd-node3
22 4.64999 1.00000 4650G 1396G 3254G 30.02 0.92 osd.22
23 4.64999 1.00000 4650G 1232G 3418G 26.50 0.81 osd.23
-15 9.29999 - 9301G 2431G 6869G 26.15 0.80 host hdd-node4
24 4.64999 1.00000 4650G 1218G 3432G 26.20 0.80 osd.24
25 4.64999 1.00000 4650G 1213G 3436G 26.09 0.80 osd.25
-1 35.14995 - 35284G 13512G 21771G 38.30 1.17 root default
-2 9.25000 - 9285G 3431G 5853G 36.96 1.13 host node1
0 1.84999 1.00000 1857G 765G 1091G 41.24 1.26 osd.0
1 1.84999 1.00000 1857G 633G 1224G 34.09 1.04 osd.1
6 1.84999 1.00000 1857G 777G 1079G 41.88 1.28 osd.6
7 1.84999 0.89999 1857G 752G 1104G 40.54 1.24 osd.7
8 1.84999 1.00000 1857G 502G 1354G 27.06 0.83 osd.8
-3 9.24995 - 9285G 3562G 5722G 38.37 1.17 host node2
2 1.84999 1.00000 1857G 766G 1090G 41.27 1.26 osd.2
3 1.84999 0.70000 1857G 674G 1182G 36.33 1.11 osd.3
9 1.84999 1.00000 1857G 580G 1276G 31.28 0.95 osd.9
10 1.84999 1.00000 1857G 814G 1042G 43.88 1.34 osd.10
11 1.84999 1.00000 1857G 725G 1131G 39.07 1.19 osd.11
-4 9.25000 - 9285G 3561G 5724G 38.35 1.17 host node3
4 1.84999 1.00000 1857G 684G 1172G 36.88 1.12 osd.4
5 1.84999 1.00000 1857G 633G 1223G 34.11 1.04 osd.5
12 1.84999 1.00000 1857G 696G 1160G 37.49 1.14 osd.12
13 1.84999 0.70000 1857G 741G 1116G 39.90 1.22 osd.13
14 1.84999 0.89999 1857G 805G 1051G 43.37 1.32 osd.14
-5 7.39999 - 7428G 2957G 4470G 39.81 1.21 host node4
15 1.84999 0.79999 1857G 742G 1115G 39.96 1.22 osd.15
16 1.84999 1.00000 1857G 634G 1222G 34.15 1.04 osd.16
17 1.84999 0.89999 1857G 803G 1053G 43.26 1.32 osd.17
26 1.84999 0.81000 1857G 777G 1079G 41.89 1.28 osd.26
TOTAL 72488G 23770G 48718G 32.79
MIN/MAX VAR: 0.80/1.34 STDDEV: 6.56

PG dump

version 62735224
stamp 2018-12-03 13:02:52.799643
last_osdmap_epoch 11755
last_pg_scan 9537
full_ratio 0.95
nearfull_ratio 0.85
pg_stat objects mip degr misp unf bytes log disklog state state_stamp v reported up up_primary acting acting_primary last_scrub scrub_stamp last_deep_scrub deep_scrub_stamp

///active+clean ones removed///

1.11d 3359 0 0 3359 0 14031434258 3034 3034 active+remapped 2018-11-09 10:54:06.861873 11755'60561357 11755:50924695 [6] 6 [6,26] 6 449'16483 2016-09-14 15:25:14.228231 448'16277 2016-09-13 06:11:45.633007

///active+clean ones removed///

pool 1 1738101 0 0 3359 0 7253679601802 1562466 1562466
pool 2 881071 0 0 0 0 3682217717410 1561924 1561924
sum 2619172 0 0 3359 0 10935897319212 3124390 3124390
osdstat kbused kbavail kb hb in hb out
0 803034868 1144251736 1947286604 [1,2,3,4,5,10,12,13,14,15,16,26] []
1 663754736 1283531868 1947286604 [0,2,3,5,10,12,13,15,16,17,26] []
2 803619260 1143667344 1947286604 [0,1,3,4,5,6,7,8,13,14,15,16,17,26] []
3 707438640 1239847964 1947286604 [0,1,2,4,5,6,7,8,12,13,14,15,17] []
4 718194072 1229092532 1947286604 [2,3,5,6,8,10,11,15,16,17,26] []
5 664279112 1283007492 1947286604 [0,1,2,3,4,6,8,10,11,15,17,26] []
6 815455088 1131831516 1947286604 [2,4,5,7,9,10,11,12,13,14,15,16,17,26] []
7 789396940 1157889664 1947286604 [2,4,6,8,9,10,11,12,13,14,15,16,17,26] []
8 526871252 1420415352 1947286604 [2,3,4,5,7,9,10,11,12,13,15,16,17] []
9 609147992 1338138612 1947286604 [0,1,4,7,8,10,13,14,15,16,17,26] []
10 854451916 1092834688 1947286604 [0,1,4,5,7,8,9,11,12,13,14,15,16,17,26] []
11 760893328 1186393276 1947286604 [1,4,5,6,7,8,10,12,13,14,15,16,17,26] []
12 730109256 1217177348 1947286604 [0,6,7,8,9,10,11,13,15,16,17,26] []
13 777029008 1170257596 1947286604 [0,1,2,3,6,7,8,9,10,11,12,14,15,16] []
14 844469760 1102816844 1947286604 [0,1,2,3,6,7,9,10,11,13,15,16,17,26] []
15 778122444 1169164160 1947286604 [0,2,3,4,6,9,10,11,14,16] []
16 664960388 1282326216 1947286604 [1,2,3,4,5,6,7,8,10,11,12,13,15,17] []
17 842428012 1104858592 1947286604 [0,1,2,3,4,5,8,9,10,11,12,13,14,16,18] []
18 1285869748 3590537232 4876406980 [0,1,17,20,21,22,23,24,25,26] []
19 1368764192 3507642788 4876406980 [0,1,18,20,21,22,23,24,25,26] []
20 1427417120 3448989860 4876406980 [4,17,18,19,21,22,23,24,25,26] []
21 1367928664 3508478316 4876406980 [4,5,18,19,20,22,23,24,25,26] []
22 1464361956 3412045024 4876406980 [4,5,18,19,20,21,23,24,25,26] []
23 1292415092 3583991888 4876406980 [4,5,18,19,20,21,22,24,25,26] []
24 1277731204 3598675776 4876406980 [4,5,18,19,20,21,22,23,25,26] []
25 1272703828 3603703152 4876406980 [4,5,18,19,20,21,22,23,24,26] []
26 815682292 1131604312 1947286604 [0,1,2,3,4,5,6,7,8,10,11,12,13,25] []

Ceph health detail

HEALTH_WARN 1 pgs stuck unclean; recovery 3359/6120420 objects misplaced (0.055%); noout,nodeep-scrub flag(s) set
pg 1.11d is stuck unclean for 2081576.511195, current state active+remapped, last acting [6,26]
recovery 3359/6120420 objects misplaced (0.055%)
noout,nodeep-scrub flag(s) set

Ceph version

ceph version 0.94.10 (b1e0532418e4631af01acbc0cedd426f1905f4af)

Regards,
Nasos Pan
Wido den Hollander
2018-12-03 13:53:55 UTC
Permalink
Hi,

How old is this cluster? I ask because this might be a CRUSH tunables
issue, which is where this kind of problem pops up.

You can try the following (it might move a lot of data!):

$ ceph osd getcrushmap -o crushmap.backup
$ ceph osd crush tunables optimal

If things go wrong you always have the old CRUSHmap:

$ ceph osd setcrushmap -i crushmap.backup

0.94.10 is EOL as well, so I would consider upgrading after this PG becomes
active+clean.

Wido

On 12/3/18 2:51 PM, Athanasios Panterlis wrote:
> Hi all,
>
> I am managing a typical small ceph cluster that consists of 4 nodes with
> each one having 7 OSDs (some in hdd pool, some in ssd pool)
>
> Having a healthy cluster and following some space issues due to bad pg
> management from ceph, I tried some reweighs in specific OSDs.
> Unfortunately the reballancing after reweigh a specific pg went to
> active+remmaped state and I have now a couple of misplaced objects
> (3359/6118678 objects misplaced (0.055%))
>
> Cluster had (for iops reasons) the scrub and deep-scrub options
> disabled. I enabled a simple scrub to all PG and waited to run. It
> finished completely without problems I guess. I would like to avoid
> deep-scrub, but if this is going to help I will run it once.
>
> I am thinking of declaring OSD 26 as lost. This way, new pg copy from 6
> will be created, correct?
> Any other less harmful thoughts on how to fix it? 
>
> I attached all the information I could provide. I am also pasting them
> raw below:
>
> *Query for faulty PG:*
>
> {
>     "state": "active+remapped",
>     "snap_trimq": "[]",
>     "epoch": 11755,
>     "up": [
>         6
>     ],
>     "acting": [
>         6,
>         26
>     ],
>     "actingbackfill": [
>         "6",
>         "26"
>     ],
>     "info": {
>         "pgid": "1.11d",
>         "last_update": "11755'60561210",
>         "last_complete": "11755'60561210",
>         "log_tail": "11755'60558123",
>         "last_user_version": 60561210,
>         "last_backfill": "MAX",
>         "purged_snaps": "[1~33,36~22]",
>         "history": {
>             "epoch_created": 31,
>             "last_epoch_started": 11681,
>             "last_epoch_clean": 11681,
>             "last_epoch_split": 0,
>             "same_up_since": 11679,
>             "same_interval_since": 11680,
>             "same_primary_since": 11510,
>             "last_scrub": "449'16483",
>             "last_scrub_stamp": "2016-09-14 15:25:14.228231",
>             "last_deep_scrub": "448'16277",
>             "last_deep_scrub_stamp": "2016-09-13 06:11:45.633007",
>             "last_clean_scrub_stamp": "2016-09-14 15:25:14.228231"
>         },
>         "stats": {
>             "version": "11755'60561210",
>             "reported_seq": "50924585",
>             "reported_epoch": "11755",
>             "state": "active+remapped",
>             "last_fresh": "2018-12-03 12:58:03.289251",
>             "last_change": "2018-11-09 10:54:06.861873",
>             "last_active": "2018-12-03 12:58:03.289251",
>             "last_peered": "2018-12-03 12:58:03.289251",
>             "last_clean": "2018-11-09 10:54:02.622866",
>             "last_became_active": "0.000000",
>             "last_became_peered": "0.000000",
>             "last_unstale": "2018-12-03 12:58:03.289251",
>             "last_undegraded": "2018-12-03 12:58:03.289251",
>             "last_fullsized": "2018-12-03 12:58:03.289251",
>             "mapping_epoch": 11679,
>             "log_start": "11755'60558123",
>             "ondisk_log_start": "11755'60558123",
>             "created": 31,
>             "last_epoch_clean": 11681,
>             "parent": "0.0",
>             "parent_split_bits": 0,
>             "last_scrub": "449'16483",
>             "last_scrub_stamp": "2016-09-14 15:25:14.228231",
>             "last_deep_scrub": "448'16277",
>             "last_deep_scrub_stamp": "2016-09-13 06:11:45.633007",
>             "last_clean_scrub_stamp": "2016-09-14 15:25:14.228231",
>             "log_size": 3087,
>             "ondisk_log_size": 3087,
>             "stats_invalid": "0",
>             "stat_sum": {
>                 "num_bytes": 14031434258,
>                 "num_objects": 3359,
>                 "num_object_clones": 0,
>                 "num_object_copies": 6718,
>                 "num_objects_missing_on_primary": 0,
>                 "num_objects_degraded": 0,
>                 "num_objects_misplaced": 3359,
>                 "num_objects_unfound": 0,
>                 "num_objects_dirty": 3359,
>                 "num_whiteouts": 0,
>                 "num_read": 27359423,
>                 "num_read_kb": 1815932413,
>                 "num_write": 121113356,
>                 "num_write_kb": 2124776643,
>                 "num_scrub_errors": 0,
>                 "num_shallow_scrub_errors": 0,
>                 "num_deep_scrub_errors": 0,
>                 "num_objects_recovered": 65218,
>                 "num_bytes_recovered": 271765903872,
>                 "num_keys_recovered": 0,
>                 "num_objects_omap": 0,
>                 "num_objects_hit_set_archive": 0,
>                 "num_bytes_hit_set_archive": 0
>             },
>             "up": [
>                 6
>             ],
>             "acting": [
>                 6,
>                 26
>             ],
>             "blocked_by": [],
>             "up_primary": 6,
>             "acting_primary": 6
>         },
>         "empty": 0,
>         "dne": 0,
>         "incomplete": 0,
>         "last_epoch_started": 11681,
>         "hit_set_history": {
>             "current_last_update": "0'0",
>             "current_last_stamp": "0.000000",
>             "current_info": {
>                 "begin": "0.000000",
>                 "end": "0.000000",
>                 "version": "0'0",
>                 "using_gmt": "1"
>             },
>             "history": []
>         }
>     },
>     "peer_info": [
>         {
>             "peer": "26",
>             "pgid": "1.11d",
>             "last_update": "11755'60561210",
>             "last_complete": "11755'60561210",
>             "log_tail": "11649'58446601",
>             "last_user_version": 58449647,
>             "last_backfill": "MAX",
>             "purged_snaps": "[1~33,36~22]",
>             "history": {
>                 "epoch_created": 31,
>                 "last_epoch_started": 11681,
>                 "last_epoch_clean": 11681,
>                 "last_epoch_split": 0,
>                 "same_up_since": 11679,
>                 "same_interval_since": 11680,
>                 "same_primary_since": 11510,
>                 "last_scrub": "449'16483",
>                 "last_scrub_stamp": "2016-09-14 15:25:14.228231",
>                 "last_deep_scrub": "448'16277",
>                 "last_deep_scrub_stamp": "2016-09-13 06:11:45.633007",
>                 "last_clean_scrub_stamp": "2016-09-14 15:25:14.228231"
>             },
>             "stats": {
>                 "version": "11678'58449646",
>                 "reported_seq": "48950066",
>                 "reported_epoch": "11678",
>                 "state": "active+clean",
>                 "last_fresh": "2018-11-09 10:54:02.263168",
>                 "last_change": "2018-11-09 08:01:12.116827",
>                 "last_active": "2018-11-09 10:54:02.263168",
>                 "last_peered": "2018-11-09 10:54:02.263168",
>                 "last_clean": "2018-11-09 10:54:02.263168",
>                 "last_became_active": "0.000000",
>                 "last_became_peered": "0.000000",
>                 "last_unstale": "2018-11-09 10:54:02.263168",
>                 "last_undegraded": "2018-11-09 10:54:02.263168",
>                 "last_fullsized": "2018-11-09 10:54:02.263168",
>                 "mapping_epoch": 11679,
>                 "log_start": "11649'58446601",
>                 "ondisk_log_start": "11649'58446601",
>                 "created": 31,
>                 "last_epoch_clean": 11610,
>                 "parent": "0.0",
>                 "parent_split_bits": 0,
>                 "last_scrub": "449'16483",
>                 "last_scrub_stamp": "2016-09-14 15:25:14.228231",
>                 "last_deep_scrub": "448'16277",
>                 "last_deep_scrub_stamp": "2016-09-13 06:11:45.633007",
>                 "last_clean_scrub_stamp": "2016-09-14 15:25:14.228231",
>                 "log_size": 3045,
>                 "ondisk_log_size": 3045,
>                 "stats_invalid": "0",
>                 "stat_sum": {
>                     "num_bytes": 18153595392,
>                     "num_objects": 4344,
>                     "num_object_clones": 0,
>                     "num_object_copies": 8688,
>                     "num_objects_missing_on_primary": 0,
>                     "num_objects_degraded": 0,
>                     "num_objects_misplaced": 0,
>                     "num_objects_unfound": 0,
>                     "num_objects_dirty": 4344,
>                     "num_whiteouts": 0,
>                     "num_read": 26674601,
>                     "num_read_kb": 1767105243,
>                     "num_write": 116892449,
>                     "num_write_kb": 2073693377,
>                     "num_scrub_errors": 0,
>                     "num_shallow_scrub_errors": 0,
>                     "num_deep_scrub_errors": 0,
>                     "num_objects_recovered": 65218,
>                     "num_bytes_recovered": 271765903872,
>                     "num_keys_recovered": 0,
>                     "num_objects_omap": 0,
>                     "num_objects_hit_set_archive": 0,
>                     "num_bytes_hit_set_archive": 0
>                 },
>                 "up": [
>                     6
>                 ],
>                 "acting": [
>                     6,
>                     26
>                 ],
>                 "blocked_by": [],
>                 "up_primary": 6,
>                 "acting_primary": 6
>             },
>             "empty": 0,
>             "dne": 0,
>             "incomplete": 0,
>             "last_epoch_started": 11681,
>             "hit_set_history": {
>                 "current_last_update": "0'0",
>                 "current_last_stamp": "0.000000",
>                 "current_info": {
>                     "begin": "0.000000",
>                     "end": "0.000000",
>                     "version": "0'0",
>                     "using_gmt": "1"
>                 },
>                 "history": []
>             }
>         }
>     ],
>     "recovery_state": [
>         {
>             "name": "Started\/Primary\/Active",
>             "enter_time": "2018-11-09 10:54:06.825830",
>             "might_have_unfound": [],
>             "recovery_progress": {
>                 "backfill_targets": [],
>                 "waiting_on_backfill": [],
>                 "last_backfill_started": "-1\/0\/\/0",
>                 "backfill_info": {
>                     "begin": "-1\/0\/\/0",
>                     "end": "-1\/0\/\/0",
>                     "objects": []
>                 },
>                 "peer_backfill_info": [],
>                 "backfills_in_flight": [],
>                 "recovering": [],
>                 "pg_backend": {
>                     "pull_from_peer": [],
>                     "pushing": []
>                 }
>             },
>             "scrub": {
>                 "scrubber.epoch_start": "0",
>                 "scrubber.active": 0,
>                 "scrubber.waiting_on": 0,
>                 "scrubber.waiting_on_whom": []
>             }
>         },
>         {
>             "name": "Started",
>             "enter_time": "2018-11-09 10:54:05.789621"
>         }
>     ],
>     "agent_state": {}
> }
>
> *Ceph status*
>
>     health HEALTH_WARN
>             1 pgs stuck unclean
>             recovery 3359/6118678 objects misplaced (0.055%)
>             noout,nodeep-scrub flag(s) set
>      monmap e3: 3 mons at
> {0=192.168.1.1:6789/0,1=192.168.1.2:6789/0,2=192.168.1.3:6789/0}
>             election epoch 4882, quorum 0,1,2 0,1,2
>      osdmap e11755: 27 osds: 27 up, 27 in; 1 remapped pgs
>             flags noout,nodeep-scrub
>       pgmap v62734988: 1024 pgs, 2 pools, 10183 GB data, 2557 kobjects
>             23768 GB used, 48720 GB / 72488 GB avail
>             3359/6118678 objects misplaced (0.055%)
>                 1023 active+clean
>                    1 active+remapped
>   client io 141 kB/s rd, 14068 kB/s wr, 925 op/s
>
> 2018-12-03 12:58:52.109913 mon.0 [INF] pgmap v62734987: 1024 pgs: 1
> active+remapped, 1023 active+clean; 10183 GB data, 23768 GB used, 48720
> GB / 72488 GB avail; 8325 kB/s rd, 16182 kB/s wr, 1704 op/s;
> 3359/6118678 objects misplaced (0.055%)
>
> *OSD tree*
>
> ID  WEIGHT   REWEIGHT SIZE   USE    AVAIL  %USE  VAR  TYPE NAME        
>      
> -11 37.19995        - 37204G 10257G 26946G 27.57 0.84 root hdd          
>    
> -12  9.29999        -  9301G  2531G  6769G 27.22 0.83     host hdd-node1
>  18  4.64999  1.00000  4650G  1226G  3424G 26.37 0.80         osd.18    
>    
>  19  4.64999  1.00000  4650G  1305G  3345G 28.06 0.86         osd.19    
>    
> -13  9.29999        -  9301G  2665G  6635G 28.66 0.87     host hdd-node2
>  20  4.64999  1.00000  4650G  1361G  3289G 29.27 0.89         osd.20    
>    
>  21  4.64999  1.00000  4650G  1304G  3346G 28.05 0.86         osd.21    
>    
> -14  9.29999        -  9301G  2628G  6672G 28.26 0.86     host hdd-node3
>  22  4.64999  1.00000  4650G  1396G  3254G 30.02 0.92         osd.22    
>    
>  23  4.64999  1.00000  4650G  1232G  3418G 26.50 0.81         osd.23    
>    
> -15  9.29999        -  9301G  2431G  6869G 26.15 0.80     host hdd-node4
>  24  4.64999  1.00000  4650G  1218G  3432G 26.20 0.80         osd.24    
>    
>  25  4.64999  1.00000  4650G  1213G  3436G 26.09 0.80         osd.25    
>    
>  -1 35.14995        - 35284G 13512G 21771G 38.30 1.17 root default      
>    
>  -2  9.25000        -  9285G  3431G  5853G 36.96 1.13     host node1    
>   0  1.84999  1.00000  1857G   765G  1091G 41.24 1.26         osd.0    
>      
>   1  1.84999  1.00000  1857G   633G  1224G 34.09 1.04         osd.1    
>      
>   6  1.84999  1.00000  1857G   777G  1079G 41.88 1.28         osd.6    
>      
>   7  1.84999  0.89999  1857G   752G  1104G 40.54 1.24         osd.7    
>      
>   8  1.84999  1.00000  1857G   502G  1354G 27.06 0.83         osd.8    
>      
>  -3  9.24995        -  9285G  3562G  5722G 38.37 1.17     host node2    
>   2  1.84999  1.00000  1857G   766G  1090G 41.27 1.26         osd.2    
>      
>   3  1.84999  0.70000  1857G   674G  1182G 36.33 1.11         osd.3    
>      
>   9  1.84999  1.00000  1857G   580G  1276G 31.28 0.95         osd.9    
>      
>  10  1.84999  1.00000  1857G   814G  1042G 43.88 1.34         osd.10    
>    
>  11  1.84999  1.00000  1857G   725G  1131G 39.07 1.19         osd.11    
>    
>  -4  9.25000        -  9285G  3561G  5724G 38.35 1.17     host node3    
>   4  1.84999  1.00000  1857G   684G  1172G 36.88 1.12         osd.4    
>      
>   5  1.84999  1.00000  1857G   633G  1223G 34.11 1.04         osd.5    
>      
>  12  1.84999  1.00000  1857G   696G  1160G 37.49 1.14         osd.12    
>    
>  13  1.84999  0.70000  1857G   741G  1116G 39.90 1.22         osd.13    
>    
>  14  1.84999  0.89999  1857G   805G  1051G 43.37 1.32         osd.14    
>    
>  -5  7.39999        -  7428G  2957G  4470G 39.81 1.21     host node4    
>  15  1.84999  0.79999  1857G   742G  1115G 39.96 1.22         osd.15    
>    
>  16  1.84999  1.00000  1857G   634G  1222G 34.15 1.04         osd.16    
>    
>  17  1.84999  0.89999  1857G   803G  1053G 43.26 1.32         osd.17    
>    
>  26  1.84999  0.81000  1857G   777G  1079G 41.89 1.28         osd.26    
>    
>                 TOTAL 72488G 23770G 48718G 32.79                        
>    
> MIN/MAX VAR: 0.80/1.34  STDDEV: 6.56
>
> *PG dump*
>
> version 62735224
> stamp 2018-12-03 13:02:52.799643
> last_osdmap_epoch 11755
> last_pg_scan 9537
> full_ratio 0.95
> nearfull_ratio 0.85
> pg_stat objects mip degr misp unf bytes log disklog state state_stamp v
> reported up up_primary acting acting_primary last_scrub scrub_stamp
> last_deep_scrub deep_scrub_stamp
>
> ///active+clean ones removed///
>
> 1.11d 3359 0 0 3359 0 14031434258 3034 3034 active+remapped 2018-11-09
> 10:54:06.861873 11755'60561357 11755:50924695 [6] 6 [6,26] 6 449'16483
> 2016-09-14 15:25:14.228231 448'16277 2016-09-13 06:11:45.633007
>
> ///active+clean ones removed///
>
> pool 1 1738101 0 0 3359 0 7253679601802 1562466 1562466
> pool 2 881071 0 0 0 0 3682217717410 1561924 1561924
>  sum 2619172 0 0 3359 0 10935897319212 3124390 3124390
> osdstat kbused kbavail kb hb in hb out
> 0 803034868 1144251736 1947286604 [1,2,3,4,5,10,12,13,14,15,16,26] []
> 1 663754736 1283531868 1947286604 [0,2,3,5,10,12,13,15,16,17,26] []
> 2 803619260 1143667344 1947286604 [0,1,3,4,5,6,7,8,13,14,15,16,17,26] []
> 3 707438640 1239847964 1947286604 [0,1,2,4,5,6,7,8,12,13,14,15,17] []
> 4 718194072 1229092532 1947286604 [2,3,5,6,8,10,11,15,16,17,26] []
> 5 664279112 1283007492 1947286604 [0,1,2,3,4,6,8,10,11,15,17,26] []
> 6 815455088 1131831516 1947286604 [2,4,5,7,9,10,11,12,13,14,15,16,17,26] []
> 7 789396940 1157889664 1947286604 [2,4,6,8,9,10,11,12,13,14,15,16,17,26] []
> 8 526871252 1420415352 1947286604 [2,3,4,5,7,9,10,11,12,13,15,16,17] []
> 9 609147992 1338138612 1947286604 [0,1,4,7,8,10,13,14,15,16,17,26] []
> 10 854451916 1092834688 1947286604
> [0,1,4,5,7,8,9,11,12,13,14,15,16,17,26] []
> 11 760893328 1186393276 1947286604 [1,4,5,6,7,8,10,12,13,14,15,16,17,26] []
> 12 730109256 1217177348 1947286604 [0,6,7,8,9,10,11,13,15,16,17,26] []
> 13 777029008 1170257596 1947286604 [0,1,2,3,6,7,8,9,10,11,12,14,15,16] []
> 14 844469760 1102816844 1947286604 [0,1,2,3,6,7,9,10,11,13,15,16,17,26] []
> 15 778122444 1169164160 1947286604 [0,2,3,4,6,9,10,11,14,16] []
> 16 664960388 1282326216 1947286604 [1,2,3,4,5,6,7,8,10,11,12,13,15,17] []
> 17 842428012 1104858592 1947286604 [0,1,2,3,4,5,8,9,10,11,12,13,14,16,18] []
> 18 1285869748 3590537232 4876406980 [0,1,17,20,21,22,23,24,25,26] []
> 19 1368764192 3507642788 4876406980 [0,1,18,20,21,22,23,24,25,26] []
> 20 1427417120 3448989860 4876406980 [4,17,18,19,21,22,23,24,25,26] []
> 21 1367928664 3508478316 4876406980 [4,5,18,19,20,22,23,24,25,26] []
> 22 1464361956 3412045024 4876406980 [4,5,18,19,20,21,23,24,25,26] []
> 23 1292415092 3583991888 4876406980 [4,5,18,19,20,21,22,24,25,26] []
> 24 1277731204 3598675776 4876406980 [4,5,18,19,20,21,22,23,25,26] []
> 25 1272703828 3603703152 4876406980 [4,5,18,19,20,21,22,23,24,26] []
> 26 815682292 1131604312 1947286604 [0,1,2,3,4,5,6,7,8,10,11,12,13,25] []
>
> *Ceph health detail*
>
> HEALTH_WARN 1 pgs stuck unclean; recovery 3359/6120420 objects misplaced
> (0.055%); noout,nodeep-scrub flag(s) set
> pg 1.11d is stuck unclean for 2081576.511195, current state
> active+remapped, last acting [6,26]
> recovery 3359/6120420 objects misplaced (0.055%)
> noout,nodeep-scrub flag(s) set
>
> *Ceph version*
>
> ceph version 0.94.10 (b1e0532418e4631af01acbc0cedd426f1905f4af)
>
> Regards,
> Nasos Pan
>
> _______________________________________________
> ceph-users mailing list
> ceph-***@lists.ceph.com
> http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
>
Athanasios Panterlis
2018-12-03 15:21:04 UTC
Permalink
Hi Wido,

Yes, it's quite old, from 2016. It's a decommissioned cluster that we just keep in a healthy state without much update effort.
I had planned to clean up unwanted disk snapshots etc., do a few reweights, update it to the latest stable release (just as you correctly mentioned), and then just maintain it. I am not proud of this, but that's the sad story of this cluster.

Thanks for the tunables proposal. I will consider it, but I fear that more reweights will bring more problems.
https://access.redhat.com/documentation/en-us/red_hat_ceph_storage/1.2.3/html/storage_strategies/crush_tunables

Can I somehow preview the changes this command is about to make before enabling them?

I am hoping for a less resource-hungry operation that is focused on this particular PG or the problematic OSDs.
Maybe just remove these OSDs completely one by one and reinsert them?
Or maybe I can create a new pool and migrate my disks to it one by one? Removing the old pool should take the faulty PG down with it, correct?

I am starting to get anxious about data integrity, and about Ceph going read-only, in case OSD 6 or 26 has availability issues...

Thanks for quick reply!

Regards,
Nasos Panterlis
________________________________
From: Wido den Hollander <***@42on.com>
Sent: Monday, December 3, 2018 3:53 PM
To: Athanasios Panterlis; ceph-***@lists.ceph.com
Subject: Re: [ceph-users] PG problem after reweight (1 PG active+remapped)

Hi,

How old is this cluster? As this might be a CRUSH tunables issue where
this pops up.

You can try (might move a lot of data!)

$ ceph osd getcrushmap -o crushmap.backup
$ ceph osd crush tunables optimal

If things go wrong you always have the old CRUSHmap:

$ ceph osd setcrushmap -i crushmap.backup

0.94.10 is EOL as well, I would consider upgrading after this PG becomes
active+clean

Wido

On 12/3/18 2:51 PM, Athanasios Panterlis wrote:
> Hi all,
>
> I am managing a typical small ceph cluster that consists of 4 nodes with
> each one having 7 OSDs (some in hdd pool, some in ssd pool)
>
> Having a healthy cluster and following some space issues due to bad pg
> management from ceph, I tried some reweighs in specific OSDs.
> Unfortunately the reballancing after reweigh a specific pg went to
> active+remmaped state and I have now a couple of misplaced objects
> (3359/6118678 objects misplaced (0.055%))
>
> Cluster had (for iops reasons) the scrub and deep-scrub options
> disabled. I enabled a simple scrub to all PG and waited to run. It
> finished completely without problems I guess. I would like to avoid
> deep-scrub, but if this is going to help I will run it once.
>
> I am thinking of declaring OSD 26 as lost. This way, new pg copy from 6
> will be created, correct?
> Any other less harmful thoughts on how to fix it?
>
> I attached all the information I could provide. I am also pasting them
> raw below:
>
> *Query for faulty PG:*
>
> {
> "state": "active+remapped",
> "snap_trimq": "[]",
> "epoch": 11755,
> "up": [
> 6
> ],
> "acting": [
> 6,
> 26
> ],
> "actingbackfill": [
> "6",
> "26"
> ],
> "info": {
> "pgid": "1.11d",
> "last_update": "11755'60561210",
> "last_complete": "11755'60561210",
> "log_tail": "11755'60558123",
> "last_user_version": 60561210,
> "last_backfill": "MAX",
> "purged_snaps": "[1~33,36~22]",
> "history": {
> "epoch_created": 31,
> "last_epoch_started": 11681,
> "last_epoch_clean": 11681,
> "last_epoch_split": 0,
> "same_up_since": 11679,
> "same_interval_since": 11680,
> "same_primary_since": 11510,
> "last_scrub": "449'16483",
> "last_scrub_stamp": "2016-09-14 15:25:14.228231",
> "last_deep_scrub": "448'16277",
> "last_deep_scrub_stamp": "2016-09-13 06:11:45.633007",
> "last_clean_scrub_stamp": "2016-09-14 15:25:14.228231"
> },
> "stats": {
> "version": "11755'60561210",
> "reported_seq": "50924585",
> "reported_epoch": "11755",
> "state": "active+remapped",
> "last_fresh": "2018-12-03 12:58:03.289251",
> "last_change": "2018-11-09 10:54:06.861873",
> "last_active": "2018-12-03 12:58:03.289251",
> "last_peered": "2018-12-03 12:58:03.289251",
> "last_clean": "2018-11-09 10:54:02.622866",
> "last_became_active": "0.000000",
> "last_became_peered": "0.000000",
> "last_unstale": "2018-12-03 12:58:03.289251",
> "last_undegraded": "2018-12-03 12:58:03.289251",
> "last_fullsized": "2018-12-03 12:58:03.289251",
> "mapping_epoch": 11679,
> "log_start": "11755'60558123",
> "ondisk_log_start": "11755'60558123",
> "created": 31,
> "last_epoch_clean": 11681,
> "parent": "0.0",
> "parent_split_bits": 0,
> "last_scrub": "449'16483",
> "last_scrub_stamp": "2016-09-14 15:25:14.228231",
> "last_deep_scrub": "448'16277",
> "last_deep_scrub_stamp": "2016-09-13 06:11:45.633007",
> "last_clean_scrub_stamp": "2016-09-14 15:25:14.228231",
> "log_size": 3087,
> "ondisk_log_size": 3087,
> "stats_invalid": "0",
> "stat_sum": {
> "num_bytes": 14031434258,
> "num_objects": 3359,
> "num_object_clones": 0,
> "num_object_copies": 6718,
> "num_objects_missing_on_primary": 0,
> "num_objects_degraded": 0,
> "num_objects_misplaced": 3359,
> "num_objects_unfound": 0,
> "num_objects_dirty": 3359,
> "num_whiteouts": 0,
> "num_read": 27359423,
> "num_read_kb": 1815932413,
> "num_write": 121113356,
> "num_write_kb": 2124776643,
> "num_scrub_errors": 0,
> "num_shallow_scrub_errors": 0,
> "num_deep_scrub_errors": 0,
> "num_objects_recovered": 65218,
> "num_bytes_recovered": 271765903872,
> "num_keys_recovered": 0,
> "num_objects_omap": 0,
> "num_objects_hit_set_archive": 0,
> "num_bytes_hit_set_archive": 0
> },
> "up": [
> 6
> ],
> "acting": [
> 6,
> 26
> ],
> "blocked_by": [],
> "up_primary": 6,
> "acting_primary": 6
> },
> "empty": 0,
> "dne": 0,
> "incomplete": 0,
> "last_epoch_started": 11681,
> "hit_set_history": {
> "current_last_update": "0'0",
> "current_last_stamp": "0.000000",
> "current_info": {
> "begin": "0.000000",
> "end": "0.000000",
> "version": "0'0",
> "using_gmt": "1"
> },
> "history": []
> }
> },
> "peer_info": [
> {
> "peer": "26",
> "pgid": "1.11d",
> "last_update": "11755'60561210",
> "last_complete": "11755'60561210",
> "log_tail": "11649'58446601",
> "last_user_version": 58449647,
> "last_backfill": "MAX",
> "purged_snaps": "[1~33,36~22]",
> "history": {
> "epoch_created": 31,
> "last_epoch_started": 11681,
> "last_epoch_clean": 11681,
> "last_epoch_split": 0,
> "same_up_since": 11679,
> "same_interval_since": 11680,
> "same_primary_since": 11510,
> "last_scrub": "449'16483",
> "last_scrub_stamp": "2016-09-14 15:25:14.228231",
> "last_deep_scrub": "448'16277",
> "last_deep_scrub_stamp": "2016-09-13 06:11:45.633007",
> "last_clean_scrub_stamp": "2016-09-14 15:25:14.228231"
> },
> "stats": {
> "version": "11678'58449646",
> "reported_seq": "48950066",
> "reported_epoch": "11678",
> "state": "active+clean",
> "last_fresh": "2018-11-09 10:54:02.263168",
> "last_change": "2018-11-09 08:01:12.116827",
> "last_active": "2018-11-09 10:54:02.263168",
> "last_peered": "2018-11-09 10:54:02.263168",
> "last_clean": "2018-11-09 10:54:02.263168",
> "last_became_active": "0.000000",
> "last_became_peered": "0.000000",
> "last_unstale": "2018-11-09 10:54:02.263168",
> "last_undegraded": "2018-11-09 10:54:02.263168",
> "last_fullsized": "2018-11-09 10:54:02.263168",
> "mapping_epoch": 11679,
> "log_start": "11649'58446601",
> "ondisk_log_start": "11649'58446601",
> "created": 31,
> "last_epoch_clean": 11610,
> "parent": "0.0",
> "parent_split_bits": 0,
> "last_scrub": "449'16483",
> "last_scrub_stamp": "2016-09-14 15:25:14.228231",
> "last_deep_scrub": "448'16277",
> "last_deep_scrub_stamp": "2016-09-13 06:11:45.633007",
> "last_clean_scrub_stamp": "2016-09-14 15:25:14.228231",
> "log_size": 3045,
> "ondisk_log_size": 3045,
> "stats_invalid": "0",
> "stat_sum": {
> "num_bytes": 18153595392,
> "num_objects": 4344,
> "num_object_clones": 0,
> "num_object_copies": 8688,
> "num_objects_missing_on_primary": 0,
> "num_objects_degraded": 0,
> "num_objects_misplaced": 0,
> "num_objects_unfound": 0,
> "num_objects_dirty": 4344,
> "num_whiteouts": 0,
> "num_read": 26674601,
> "num_read_kb": 1767105243,
> "num_write": 116892449,
> "num_write_kb": 2073693377,
> "num_scrub_errors": 0,
> "num_shallow_scrub_errors": 0,
> "num_deep_scrub_errors": 0,
> "num_objects_recovered": 65218,
> "num_bytes_recovered": 271765903872,
> "num_keys_recovered": 0,
> "num_objects_omap": 0,
> "num_objects_hit_set_archive": 0,
> "num_bytes_hit_set_archive": 0
> },
> "up": [
> 6
> ],
> "acting": [
> 6,
> 26
> ],
> "blocked_by": [],
> "up_primary": 6,
> "acting_primary": 6
> },
> "empty": 0,
> "dne": 0,
> "incomplete": 0,
> "last_epoch_started": 11681,
> "hit_set_history": {
> "current_last_update": "0'0",
> "current_last_stamp": "0.000000",
> "current_info": {
> "begin": "0.000000",
> "end": "0.000000",
> "version": "0'0",
> "using_gmt": "1"
> },
> "history": []
> }
> }
> ],
> "recovery_state": [
> {
> "name": "Started\/Primary\/Active",
> "enter_time": "2018-11-09 10:54:06.825830",
> "might_have_unfound": [],
> "recovery_progress": {
> "backfill_targets": [],
> "waiting_on_backfill": [],
> "last_backfill_started": "-1\/0\/\/0",
> "backfill_info": {
> "begin": "-1\/0\/\/0",
> "end": "-1\/0\/\/0",
> "objects": []
> },
> "peer_backfill_info": [],
> "backfills_in_flight": [],
> "recovering": [],
> "pg_backend": {
> "pull_from_peer": [],
> "pushing": []
> }
> },
> "scrub": {
> "scrubber.epoch_start": "0",
> "scrubber.active": 0,
> "scrubber.waiting_on": 0,
> "scrubber.waiting_on_whom": []
> }
> },
> {
> "name": "Started",
> "enter_time": "2018-11-09 10:54:05.789621"
> }
> ],
> "agent_state": {}
> }
>
> *Ceph status*
>
> health HEALTH_WARN
> 1 pgs stuck unclean
> recovery 3359/6118678 objects misplaced (0.055%)
> noout,nodeep-scrub flag(s) set
> monmap e3: 3 mons at
> {0=192.168.1.1:6789/0,1=192.168.1.2:6789/0,2=192.168.1.3:6789/0}
> election epoch 4882, quorum 0,1,2 0,1,2
> osdmap e11755: 27 osds: 27 up, 27 in; 1 remapped pgs
> flags noout,nodeep-scrub
> pgmap v62734988: 1024 pgs, 2 pools, 10183 GB data, 2557 kobjects
> 23768 GB used, 48720 GB / 72488 GB avail
> 3359/6118678 objects misplaced (0.055%)
> 1023 active+clean
> 1 active+remapped
> client io 141 kB/s rd, 14068 kB/s wr, 925 op/s
>
> 2018-12-03 12:58:52.109913 mon.0 [INF] pgmap v62734987: 1024 pgs: 1
> active+remapped, 1023 active+clean; 10183 GB data, 23768 GB used, 48720
> GB / 72488 GB avail; 8325 kB/s rd, 16182 kB/s wr, 1704 op/s;
> 3359/6118678 objects misplaced (0.055%)
>
> *OSD tree*
>
> ID WEIGHT REWEIGHT SIZE USE AVAIL %USE VAR TYPE NAME
>
> -11 37.19995 - 37204G 10257G 26946G 27.57 0.84 root hdd
>
> -12 9.29999 - 9301G 2531G 6769G 27.22 0.83 host hdd-node1
> 18 4.64999 1.00000 4650G 1226G 3424G 26.37 0.80 osd.18
>
> 19 4.64999 1.00000 4650G 1305G 3345G 28.06 0.86 osd.19
>
> -13 9.29999 - 9301G 2665G 6635G 28.66 0.87 host hdd-node2
> 20 4.64999 1.00000 4650G 1361G 3289G 29.27 0.89 osd.20
>
> 21 4.64999 1.00000 4650G 1304G 3346G 28.05 0.86 osd.21
>
> -14 9.29999 - 9301G 2628G 6672G 28.26 0.86 host hdd-node3
> 22 4.64999 1.00000 4650G 1396G 3254G 30.02 0.92 osd.22
>
> 23 4.64999 1.00000 4650G 1232G 3418G 26.50 0.81 osd.23
>
> -15 9.29999 - 9301G 2431G 6869G 26.15 0.80 host hdd-node4
> 24 4.64999 1.00000 4650G 1218G 3432G 26.20 0.80 osd.24
>
> 25 4.64999 1.00000 4650G 1213G 3436G 26.09 0.80 osd.25
>
> -1 35.14995 - 35284G 13512G 21771G 38.30 1.17 root default
>
> -2 9.25000 - 9285G 3431G 5853G 36.96 1.13 host node1
> 0 1.84999 1.00000 1857G 765G 1091G 41.24 1.26 osd.0
>
> 1 1.84999 1.00000 1857G 633G 1224G 34.09 1.04 osd.1
>
> 6 1.84999 1.00000 1857G 777G 1079G 41.88 1.28 osd.6
>
> 7 1.84999 0.89999 1857G 752G 1104G 40.54 1.24 osd.7
>
> 8 1.84999 1.00000 1857G 502G 1354G 27.06 0.83 osd.8
>
> -3 9.24995 - 9285G 3562G 5722G 38.37 1.17 host node2
> 2 1.84999 1.00000 1857G 766G 1090G 41.27 1.26 osd.2
>
> 3 1.84999 0.70000 1857G 674G 1182G 36.33 1.11 osd.3
>
> 9 1.84999 1.00000 1857G 580G 1276G 31.28 0.95 osd.9
>
> 10 1.84999 1.00000 1857G 814G 1042G 43.88 1.34 osd.10
>
> 11 1.84999 1.00000 1857G 725G 1131G 39.07 1.19 osd.11
>
> -4 9.25000 - 9285G 3561G 5724G 38.35 1.17 host node3
> 4 1.84999 1.00000 1857G 684G 1172G 36.88 1.12 osd.4
>
> 5 1.84999 1.00000 1857G 633G 1223G 34.11 1.04 osd.5
>
> 12 1.84999 1.00000 1857G 696G 1160G 37.49 1.14 osd.12
>
> 13 1.84999 0.70000 1857G 741G 1116G 39.90 1.22 osd.13
>
> 14 1.84999 0.89999 1857G 805G 1051G 43.37 1.32 osd.14
>
> -5 7.39999 - 7428G 2957G 4470G 39.81 1.21 host node4
> 15 1.84999 0.79999 1857G 742G 1115G 39.96 1.22 osd.15
>
> 16 1.84999 1.00000 1857G 634G 1222G 34.15 1.04 osd.16
>
> 17 1.84999 0.89999 1857G 803G 1053G 43.26 1.32 osd.17
>
> 26 1.84999 0.81000 1857G 777G 1079G 41.89 1.28 osd.26
>
> TOTAL 72488G 23770G 48718G 32.79
>
> MIN/MAX VAR: 0.80/1.34 STDDEV: 6.56
>
> *PG dump*
>
> version 62735224
> stamp 2018-12-03 13:02:52.799643
> last_osdmap_epoch 11755
> last_pg_scan 9537
> full_ratio 0.95
> nearfull_ratio 0.85
> pg_stat objects mip degr misp unf bytes log disklog state state_stamp v
> reported up up_primary acting acting_primary last_scrub scrub_stamp
> last_deep_scrub deep_scrub_stamp
>
> ///active+clean ones removed///
>
> 1.11d 3359 0 0 3359 0 14031434258 3034 3034 active+remapped 2018-11-09
> 10:54:06.861873 11755'60561357 11755:50924695 [6] 6 [6,26] 6 449'16483
> 2016-09-14 15:25:14.228231 448'16277 2016-09-13 06:11:45.633007
>
> ///active+clean ones removed///
>
> pool 1 1738101 0 0 3359 0 7253679601802 1562466 1562466
> pool 2 881071 0 0 0 0 3682217717410 1561924 1561924
> sum 2619172 0 0 3359 0 10935897319212 3124390 3124390
> osdstat kbused kbavail kb hb in hb out
> 0 803034868 1144251736 1947286604 [1,2,3,4,5,10,12,13,14,15,16,26] []
> 1 663754736 1283531868 1947286604 [0,2,3,5,10,12,13,15,16,17,26] []
> 2 803619260 1143667344 1947286604 [0,1,3,4,5,6,7,8,13,14,15,16,17,26] []
> 3 707438640 1239847964 1947286604 [0,1,2,4,5,6,7,8,12,13,14,15,17] []
> 4 718194072 1229092532 1947286604 [2,3,5,6,8,10,11,15,16,17,26] []
> 5 664279112 1283007492 1947286604 [0,1,2,3,4,6,8,10,11,15,17,26] []
> 6 815455088 1131831516 1947286604 [2,4,5,7,9,10,11,12,13,14,15,16,17,26] []
> 7 789396940 1157889664 1947286604 [2,4,6,8,9,10,11,12,13,14,15,16,17,26] []
> 8 526871252 1420415352 1947286604 [2,3,4,5,7,9,10,11,12,13,15,16,17] []
> 9 609147992 1338138612 1947286604 [0,1,4,7,8,10,13,14,15,16,17,26] []
> 10 854451916 1092834688 1947286604
> [0,1,4,5,7,8,9,11,12,13,14,15,16,17,26] []
> 11 760893328 1186393276 1947286604 [1,4,5,6,7,8,10,12,13,14,15,16,17,26] []
> 12 730109256 1217177348 1947286604 [0,6,7,8,9,10,11,13,15,16,17,26] []
> 13 777029008 1170257596 1947286604 [0,1,2,3,6,7,8,9,10,11,12,14,15,16] []
> 14 844469760 1102816844 1947286604 [0,1,2,3,6,7,9,10,11,13,15,16,17,26] []
> 15 778122444 1169164160 1947286604 [0,2,3,4,6,9,10,11,14,16] []
> 16 664960388 1282326216 1947286604 [1,2,3,4,5,6,7,8,10,11,12,13,15,17] []
> 17 842428012 1104858592 1947286604 [0,1,2,3,4,5,8,9,10,11,12,13,14,16,18] []
> 18 1285869748 3590537232 4876406980 [0,1,17,20,21,22,23,24,25,26] []
> 19 1368764192 3507642788 4876406980 [0,1,18,20,21,22,23,24,25,26] []
> 20 1427417120 3448989860 4876406980 [4,17,18,19,21,22,23,24,25,26] []
> 21 1367928664 3508478316 4876406980 [4,5,18,19,20,22,23,24,25,26] []
> 22 1464361956 3412045024 4876406980 [4,5,18,19,20,21,23,24,25,26] []
> 23 1292415092 3583991888 4876406980 [4,5,18,19,20,21,22,24,25,26] []
> 24 1277731204 3598675776 4876406980 [4,5,18,19,20,21,22,23,25,26] []
> 25 1272703828 3603703152 4876406980 [4,5,18,19,20,21,22,23,24,26] []
> 26 815682292 1131604312 1947286604 [0,1,2,3,4,5,6,7,8,10,11,12,13,25] []
>
> *Ceph health detail*
>
> HEALTH_WARN 1 pgs stuck unclean; recovery 3359/6120420 objects misplaced
> (0.055%); noout,nodeep-scrub flag(s) set
> pg 1.11d is stuck unclean for 2081576.511195, current state
> active+remapped, last acting [6,26]
> recovery 3359/6120420 objects misplaced (0.055%)
> noout,nodeep-scrub flag(s) set
>
> *Ceph version*
>
> ceph version 0.94.10 (b1e0532418e4631af01acbc0cedd426f1905f4af)
>
> Regards,
> Nasos Pan
>
> _______________________________________________
> ceph-users mailing list
> ceph-***@lists.ceph.com
> http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
>
Wido den Hollander
2018-12-03 16:51:44 UTC
Permalink
Hi,

On 12/3/18 4:21 PM, Athanasios Panterlis wrote:
> Hi Wido,
>
> Yeap its quite old, since 2016. Its from a decommissioned cluster that
> we just keep in healthy state without much update efforts.
> I had in plan to do a clean up of unwanted disks snapshots etc, do a few
> re-weights, update it to latest stable (just like correctly you
> mentioned) and then just maintain it. I am not proud of this, but that's
> the sad story for this cluster.
>
> Thanks for tunables proposal. I will consider this. But I fear that more
> reweighs will bring more problems.
> https://access.redhat.com/documentation/en-us/red_hat_ceph_storage/1.2.3/html/storage_strategies/crush_tunables
>
> Can I somehow check the adjustments that are about to be edited with
> this command prior of enabling them?
>

No, but once you enable them and you think the rebalance is too big,
just insert the old map.

> I am hoping for a less resource hungry operation that will be focused in
> this particular PG or problematic OSDS.

No, this seems like a CRUSH mapping issue.

> Maybe just remove these OSDs completely one by one and reinsert them?
> Maybe I can create a new pool and just migrate one by one my disks?
> Removing old pool should take down with it the faulty PG, correct?
>

If you aren't using that pool, just removing it would be enough to
correct the problem.

But do NOT remove any OSDs as that will not solve the problem. It might
make it worse.

Wido

> I am starting to get anxious for data integrity or ceph readonly state
> also in case OSD 6 or 26 have availability issues...
>
> Thanks for quick reply!
>
> Regards,
> Nasos Panterlis
> ------------------------------------------------------------------------
> *From:* Wido den Hollander <***@42on.com>
> *Sent:* Monday, December 3, 2018 3:53 PM
> *To:* Athanasios Panterlis; ceph-***@lists.ceph.com
> *Subject:* Re: [ceph-users] PG problem after reweight (1 PG
> active+remapped)
>  
> Hi,
>
> How old is this cluster? As this might be a CRUSH tunables issue where
> this pops up.
>
> You can try (might move a lot of data!)
>
> $ ceph osd getcrushmap -o crushmap.backup
> $ ceph osd crush tunables optimal
>
> If things go wrong you always have the old CRUSHmap:
>
> $ ceph osd setcrushmap -i crushmap.backup
>
> 0.94.10 is EOL as well, I would consider upgrading after this PG becomes
> active+clean
>
> Wido
>
> On 12/3/18 2:51 PM, Athanasios Panterlis wrote:
>> Hi all,
>>
>> I am managing a typical small ceph cluster that consists of 4 nodes with
>> each one having 7 OSDs (some in hdd pool, some in ssd pool)
>>
>> Having a healthy cluster and following some space issues due to bad pg
>> management from ceph, I tried some reweighs in specific OSDs.
>> Unfortunately the reballancing after reweigh a specific pg went to
>> active+remmaped state and I have now a couple of misplaced objects
>> (3359/6118678 objects misplaced (0.055%))
>>
>> Cluster had (for iops reasons) the scrub and deep-scrub options
>> disabled. I enabled a simple scrub to all PG and waited to run. It
>> finished completely without problems I guess. I would like to avoid
>> deep-scrub, but if this is going to help I will run it once.
>>
>> I am thinking of declaring OSD 26 as lost. This way, new pg copy from 6
>> will be created, correct?
>> Any other less harmful thoughts on how to fix it? 
>>
>> I attached all the information I could provide. I am also pasting them
>> raw below:
>>
>> *Query for faulty PG:*
>>
>> {
>>     "state": "active+remapped",
>>     "snap_trimq": "[]",
>>     "epoch": 11755,
>>     "up": [
>>         6
>>     ],
>>     "acting": [
>>         6,
>>         26
>>     ],
>>     "actingbackfill": [
>>         "6",
>>         "26"
>>     ],
>>     "info": {
>>         "pgid": "1.11d",
>>         "last_update": "11755'60561210",
>>         "last_complete": "11755'60561210",
>>         "log_tail": "11755'60558123",
>>         "last_user_version": 60561210,
>>         "last_backfill": "MAX",
>>         "purged_snaps": "[1~33,36~22]",
>>         "history": {
>>             "epoch_created": 31,
>>             "last_epoch_started": 11681,
>>             "last_epoch_clean": 11681,
>>             "last_epoch_split": 0,
>>             "same_up_since": 11679,
>>             "same_interval_since": 11680,
>>             "same_primary_since": 11510,
>>             "last_scrub": "449'16483",
>>             "last_scrub_stamp": "2016-09-14 15:25:14.228231",
>>             "last_deep_scrub": "448'16277",
>>             "last_deep_scrub_stamp": "2016-09-13 06:11:45.633007",
>>             "last_clean_scrub_stamp": "2016-09-14 15:25:14.228231"
>>         },
>>         "stats": {
>>             "version": "11755'60561210",
>>             "reported_seq": "50924585",
>>             "reported_epoch": "11755",
>>             "state": "active+remapped",
>>             "last_fresh": "2018-12-03 12:58:03.289251",
>>             "last_change": "2018-11-09 10:54:06.861873",
>>             "last_active": "2018-12-03 12:58:03.289251",
>>             "last_peered": "2018-12-03 12:58:03.289251",
>>             "last_clean": "2018-11-09 10:54:02.622866",
>>             "last_became_active": "0.000000",
>>             "last_became_peered": "0.000000",
>>             "last_unstale": "2018-12-03 12:58:03.289251",
>>             "last_undegraded": "2018-12-03 12:58:03.289251",
>>             "last_fullsized": "2018-12-03 12:58:03.289251",
>>             "mapping_epoch": 11679,
>>             "log_start": "11755'60558123",
>>             "ondisk_log_start": "11755'60558123",
>>             "created": 31,
>>             "last_epoch_clean": 11681,
>>             "parent": "0.0",
>>             "parent_split_bits": 0,
>>             "last_scrub": "449'16483",
>>             "last_scrub_stamp": "2016-09-14 15:25:14.228231",
>>             "last_deep_scrub": "448'16277",
>>             "last_deep_scrub_stamp": "2016-09-13 06:11:45.633007",
>>             "last_clean_scrub_stamp": "2016-09-14 15:25:14.228231",
>>             "log_size": 3087,
>>             "ondisk_log_size": 3087,
>>             "stats_invalid": "0",
>>             "stat_sum": {
>>                 "num_bytes": 14031434258,
>>                 "num_objects": 3359,
>>                 "num_object_clones": 0,
>>                 "num_object_copies": 6718,
>>                 "num_objects_missing_on_primary": 0,
>>                 "num_objects_degraded": 0,
>>                 "num_objects_misplaced": 3359,
>>                 "num_objects_unfound": 0,
>>                 "num_objects_dirty": 3359,
>>                 "num_whiteouts": 0,
>>                 "num_read": 27359423,
>>                 "num_read_kb": 1815932413,
>>                 "num_write": 121113356,
>>                 "num_write_kb": 2124776643,
>>                 "num_scrub_errors": 0,
>>                 "num_shallow_scrub_errors": 0,
>>                 "num_deep_scrub_errors": 0,
>>                 "num_objects_recovered": 65218,
>>                 "num_bytes_recovered": 271765903872,
>>                 "num_keys_recovered": 0,
>>                 "num_objects_omap": 0,
>>                 "num_objects_hit_set_archive": 0,
>>                 "num_bytes_hit_set_archive": 0
>>             },
>>             "up": [
>>                 6
>>             ],
>>             "acting": [
>>                 6,
>>                 26
>>             ],
>>             "blocked_by": [],
>>             "up_primary": 6,
>>             "acting_primary": 6
>>         },
>>         "empty": 0,
>>         "dne": 0,
>>         "incomplete": 0,
>>         "last_epoch_started": 11681,
>>         "hit_set_history": {
>>             "current_last_update": "0'0",
>>             "current_last_stamp": "0.000000",
>>             "current_info": {
>>                 "begin": "0.000000",
>>                 "end": "0.000000",
>>                 "version": "0'0",
>>                 "using_gmt": "1"
>>             },
>>             "history": []
>>         }
>>     },
>>     "peer_info": [
>>         {
>>             "peer": "26",
>>             "pgid": "1.11d",
>>             "last_update": "11755'60561210",
>>             "last_complete": "11755'60561210",
>>             "log_tail": "11649'58446601",
>>             "last_user_version": 58449647,
>>             "last_backfill": "MAX",
>>             "purged_snaps": "[1~33,36~22]",
>>             "history": {
>>                 "epoch_created": 31,
>>                 "last_epoch_started": 11681,
>>                 "last_epoch_clean": 11681,
>>                 "last_epoch_split": 0,
>>                 "same_up_since": 11679,
>>                 "same_interval_since": 11680,
>>                 "same_primary_since": 11510,
>>                 "last_scrub": "449'16483",
>>                 "last_scrub_stamp": "2016-09-14 15:25:14.228231",
>>                 "last_deep_scrub": "448'16277",
>>                 "last_deep_scrub_stamp": "2016-09-13 06:11:45.633007",
>>                 "last_clean_scrub_stamp": "2016-09-14 15:25:14.228231"
>>             },
>>             "stats": {
>>                 "version": "11678'58449646",
>>                 "reported_seq": "48950066",
>>                 "reported_epoch": "11678",
>>                 "state": "active+clean",
>>                 "last_fresh": "2018-11-09 10:54:02.263168",
>>                 "last_change": "2018-11-09 08:01:12.116827",
>>                 "last_active": "2018-11-09 10:54:02.263168",
>>                 "last_peered": "2018-11-09 10:54:02.263168",
>>                 "last_clean": "2018-11-09 10:54:02.263168",
>>                 "last_became_active": "0.000000",
>>                 "last_became_peered": "0.000000",
>>                 "last_unstale": "2018-11-09 10:54:02.263168",
>>                 "last_undegraded": "2018-11-09 10:54:02.263168",
>>                 "last_fullsized": "2018-11-09 10:54:02.263168",
>>                 "mapping_epoch": 11679,
>>                 "log_start": "11649'58446601",
>>                 "ondisk_log_start": "11649'58446601",
>>                 "created": 31,
>>                 "last_epoch_clean": 11610,
>>                 "parent": "0.0",
>>                 "parent_split_bits": 0,
>>                 "last_scrub": "449'16483",
>>                 "last_scrub_stamp": "2016-09-14 15:25:14.228231",
>>                 "last_deep_scrub": "448'16277",
>>                 "last_deep_scrub_stamp": "2016-09-13 06:11:45.633007",
>>                 "last_clean_scrub_stamp": "2016-09-14 15:25:14.228231",
>>                 "log_size": 3045,
>>                 "ondisk_log_size": 3045,
>>                 "stats_invalid": "0",
>>                 "stat_sum": {
>>                     "num_bytes": 18153595392,
>>                     "num_objects": 4344,
>>                     "num_object_clones": 0,
>>                     "num_object_copies": 8688,
>>                     "num_objects_missing_on_primary": 0,
>>                     "num_objects_degraded": 0,
>>                     "num_objects_misplaced": 0,
>>                     "num_objects_unfound": 0,
>>                     "num_objects_dirty": 4344,
>>                     "num_whiteouts": 0,
>>                     "num_read": 26674601,
>>                     "num_read_kb": 1767105243,
>>                     "num_write": 116892449,
>>                     "num_write_kb": 2073693377,
>>                     "num_scrub_errors": 0,
>>                     "num_shallow_scrub_errors": 0,
>>                     "num_deep_scrub_errors": 0,
>>                     "num_objects_recovered": 65218,
>>                     "num_bytes_recovered": 271765903872,
>>                     "num_keys_recovered": 0,
>>                     "num_objects_omap": 0,
>>                     "num_objects_hit_set_archive": 0,
>>                     "num_bytes_hit_set_archive": 0
>>                 },
>>                 "up": [
>>                     6
>>                 ],
>>                 "acting": [
>>                     6,
>>                     26
>>                 ],
>>                 "blocked_by": [],
>>                 "up_primary": 6,
>>                 "acting_primary": 6
>>             },
>>             "empty": 0,
>>             "dne": 0,
>>             "incomplete": 0,
>>             "last_epoch_started": 11681,
>>             "hit_set_history": {
>>                 "current_last_update": "0'0",
>>                 "current_last_stamp": "0.000000",
>>                 "current_info": {
>>                     "begin": "0.000000",
>>                     "end": "0.000000",
>>                     "version": "0'0",
>>                     "using_gmt": "1"
>>                 },
>>                 "history": []
>>             }
>>         }
>>     ],
>>     "recovery_state": [
>>         {
>>             "name": "Started\/Primary\/Active",
>>             "enter_time": "2018-11-09 10:54:06.825830",
>>             "might_have_unfound": [],
>>             "recovery_progress": {
>>                 "backfill_targets": [],
>>                 "waiting_on_backfill": [],
>>                 "last_backfill_started": "-1\/0\/\/0",
>>                 "backfill_info": {
>>                     "begin": "-1\/0\/\/0",
>>                     "end": "-1\/0\/\/0",
>>                     "objects": []
>>                 },
>>                 "peer_backfill_info": [],
>>                 "backfills_in_flight": [],
>>                 "recovering": [],
>>                 "pg_backend": {
>>                     "pull_from_peer": [],
>>                     "pushing": []
>>                 }
>>             },
>>             "scrub": {
>>                 "scrubber.epoch_start": "0",
>>                 "scrubber.active": 0,
>>                 "scrubber.waiting_on": 0,
>>                 "scrubber.waiting_on_whom": []
>>             }
>>         },
>>         {
>>             "name": "Started",
>>             "enter_time": "2018-11-09 10:54:05.789621"
>>         }
>>     ],
>>     "agent_state": {}
>> }
>>
>> *Ceph status*
>>
>>     health HEALTH_WARN
>>             1 pgs stuck unclean
>>             recovery 3359/6118678 objects misplaced (0.055%)
>>             noout,nodeep-scrub flag(s) set
>>      monmap e3: 3 mons at
>> {0=192.168.1.1:6789/0,1=192.168.1.2:6789/0,2=192.168.1.3:6789/0}
>>             election epoch 4882, quorum 0,1,2 0,1,2
>>      osdmap e11755: 27 osds: 27 up, 27 in; 1 remapped pgs
>>             flags noout,nodeep-scrub
>>       pgmap v62734988: 1024 pgs, 2 pools, 10183 GB data, 2557 kobjects
>>             23768 GB used, 48720 GB / 72488 GB avail
>>             3359/6118678 objects misplaced (0.055%)
>>                 1023 active+clean
>>                    1 active+remapped
>>   client io 141 kB/s rd, 14068 kB/s wr, 925 op/s
>>
>> 2018-12-03 12:58:52.109913 mon.0 [INF] pgmap v62734987: 1024 pgs: 1
>> active+remapped, 1023 active+clean; 10183 GB data, 23768 GB used, 48720
>> GB / 72488 GB avail; 8325 kB/s rd, 16182 kB/s wr, 1704 op/s;
>> 3359/6118678 objects misplaced (0.055%)
>>
>> *OSD tree*
>>
>> ID  WEIGHT   REWEIGHT SIZE   USE    AVAIL  %USE  VAR  TYPE NAME
>> -11 37.19995        - 37204G 10257G 26946G 27.57 0.84 root hdd
>> -12  9.29999        -  9301G  2531G  6769G 27.22 0.83     host hdd-node1
>>  18  4.64999  1.00000  4650G  1226G  3424G 26.37 0.80         osd.18
>>  19  4.64999  1.00000  4650G  1305G  3345G 28.06 0.86         osd.19
>> -13  9.29999        -  9301G  2665G  6635G 28.66 0.87     host hdd-node2
>>  20  4.64999  1.00000  4650G  1361G  3289G 29.27 0.89         osd.20
>>  21  4.64999  1.00000  4650G  1304G  3346G 28.05 0.86         osd.21
>> -14  9.29999        -  9301G  2628G  6672G 28.26 0.86     host hdd-node3
>>  22  4.64999  1.00000  4650G  1396G  3254G 30.02 0.92         osd.22
>>  23  4.64999  1.00000  4650G  1232G  3418G 26.50 0.81         osd.23
>> -15  9.29999        -  9301G  2431G  6869G 26.15 0.80     host hdd-node4
>>  24  4.64999  1.00000  4650G  1218G  3432G 26.20 0.80         osd.24
>>  25  4.64999  1.00000  4650G  1213G  3436G 26.09 0.80         osd.25
>>  -1 35.14995        - 35284G 13512G 21771G 38.30 1.17 root default
>>  -2  9.25000        -  9285G  3431G  5853G 36.96 1.13     host node1
>>   0  1.84999  1.00000  1857G   765G  1091G 41.24 1.26         osd.0
>>   1  1.84999  1.00000  1857G   633G  1224G 34.09 1.04         osd.1
>>   6  1.84999  1.00000  1857G   777G  1079G 41.88 1.28         osd.6
>>   7  1.84999  0.89999  1857G   752G  1104G 40.54 1.24         osd.7
>>   8  1.84999  1.00000  1857G   502G  1354G 27.06 0.83         osd.8
>>  -3  9.24995        -  9285G  3562G  5722G 38.37 1.17     host node2
>>   2  1.84999  1.00000  1857G   766G  1090G 41.27 1.26         osd.2
>>   3  1.84999  0.70000  1857G   674G  1182G 36.33 1.11         osd.3
>>   9  1.84999  1.00000  1857G   580G  1276G 31.28 0.95         osd.9
>>  10  1.84999  1.00000  1857G   814G  1042G 43.88 1.34         osd.10
>>  11  1.84999  1.00000  1857G   725G  1131G 39.07 1.19         osd.11
>>  -4  9.25000        -  9285G  3561G  5724G 38.35 1.17     host node3
>>   4  1.84999  1.00000  1857G   684G  1172G 36.88 1.12         osd.4
>>   5  1.84999  1.00000  1857G   633G  1223G 34.11 1.04         osd.5
>>  12  1.84999  1.00000  1857G   696G  1160G 37.49 1.14         osd.12
>>  13  1.84999  0.70000  1857G   741G  1116G 39.90 1.22         osd.13
>>  14  1.84999  0.89999  1857G   805G  1051G 43.37 1.32         osd.14
>>  -5  7.39999        -  7428G  2957G  4470G 39.81 1.21     host node4
>>  15  1.84999  0.79999  1857G   742G  1115G 39.96 1.22         osd.15
>>  16  1.84999  1.00000  1857G   634G  1222G 34.15 1.04         osd.16
>>  17  1.84999  0.89999  1857G   803G  1053G 43.26 1.32         osd.17
>>  26  1.84999  0.81000  1857G   777G  1079G 41.89 1.28         osd.26
>>                 TOTAL 72488G 23770G 48718G 32.79
>> MIN/MAX VAR: 0.80/1.34  STDDEV: 6.56
>>
>> *PG dump*
>>
>> version 62735224
>> stamp 2018-12-03 13:02:52.799643
>> last_osdmap_epoch 11755
>> last_pg_scan 9537
>> full_ratio 0.95
>> nearfull_ratio 0.85
>> pg_stat objects mip degr misp unf bytes log disklog state state_stamp v reported up up_primary acting acting_primary last_scrub scrub_stamp last_deep_scrub deep_scrub_stamp
>>
>> ///active+clean ones removed///
>>
>> 1.11d 3359 0 0 3359 0 14031434258 3034 3034 active+remapped 2018-11-09 10:54:06.861873 11755'60561357 11755:50924695 [6] 6 [6,26] 6 449'16483 2016-09-14 15:25:14.228231 448'16277 2016-09-13 06:11:45.633007
>>
>> ///active+clean ones removed///
>>
>> pool 1 1738101 0 0 3359 0 7253679601802 1562466 1562466
>> pool 2 881071 0 0 0 0 3682217717410 1561924 1561924
>>  sum 2619172 0 0 3359 0 10935897319212 3124390 3124390
>> osdstat kbused kbavail kb hb in hb out
>> 0 803034868 1144251736 1947286604 [1,2,3,4,5,10,12,13,14,15,16,26] []
>> 1 663754736 1283531868 1947286604 [0,2,3,5,10,12,13,15,16,17,26] []
>> 2 803619260 1143667344 1947286604 [0,1,3,4,5,6,7,8,13,14,15,16,17,26] []
>> 3 707438640 1239847964 1947286604 [0,1,2,4,5,6,7,8,12,13,14,15,17] []
>> 4 718194072 1229092532 1947286604 [2,3,5,6,8,10,11,15,16,17,26] []
>> 5 664279112 1283007492 1947286604 [0,1,2,3,4,6,8,10,11,15,17,26] []
>> 6 815455088 1131831516 1947286604 [2,4,5,7,9,10,11,12,13,14,15,16,17,26] []
>> 7 789396940 1157889664 1947286604 [2,4,6,8,9,10,11,12,13,14,15,16,17,26] []
>> 8 526871252 1420415352 1947286604 [2,3,4,5,7,9,10,11,12,13,15,16,17] []
>> 9 609147992 1338138612 1947286604 [0,1,4,7,8,10,13,14,15,16,17,26] []
>> 10 854451916 1092834688 1947286604 [0,1,4,5,7,8,9,11,12,13,14,15,16,17,26] []
>> 11 760893328 1186393276 1947286604 [1,4,5,6,7,8,10,12,13,14,15,16,17,26] []
>> 12 730109256 1217177348 1947286604 [0,6,7,8,9,10,11,13,15,16,17,26] []
>> 13 777029008 1170257596 1947286604 [0,1,2,3,6,7,8,9,10,11,12,14,15,16] []
>> 14 844469760 1102816844 1947286604 [0,1,2,3,6,7,9,10,11,13,15,16,17,26] []
>> 15 778122444 1169164160 1947286604 [0,2,3,4,6,9,10,11,14,16] []
>> 16 664960388 1282326216 1947286604 [1,2,3,4,5,6,7,8,10,11,12,13,15,17] []
>> 17 842428012 1104858592 1947286604 [0,1,2,3,4,5,8,9,10,11,12,13,14,16,18] []
>> 18 1285869748 3590537232 4876406980 [0,1,17,20,21,22,23,24,25,26] []
>> 19 1368764192 3507642788 4876406980 [0,1,18,20,21,22,23,24,25,26] []
>> 20 1427417120 3448989860 4876406980 [4,17,18,19,21,22,23,24,25,26] []
>> 21 1367928664 3508478316 4876406980 [4,5,18,19,20,22,23,24,25,26] []
>> 22 1464361956 3412045024 4876406980 [4,5,18,19,20,21,23,24,25,26] []
>> 23 1292415092 3583991888 4876406980 [4,5,18,19,20,21,22,24,25,26] []
>> 24 1277731204 3598675776 4876406980 [4,5,18,19,20,21,22,23,25,26] []
>> 25 1272703828 3603703152 4876406980 [4,5,18,19,20,21,22,23,24,26] []
>> 26 815682292 1131604312 1947286604 [0,1,2,3,4,5,6,7,8,10,11,12,13,25] []
>>
>> *Ceph health detail*
>>
>> HEALTH_WARN 1 pgs stuck unclean; recovery 3359/6120420 objects misplaced (0.055%); noout,nodeep-scrub flag(s) set
>> pg 1.11d is stuck unclean for 2081576.511195, current state active+remapped, last acting [6,26]
>> recovery 3359/6120420 objects misplaced (0.055%)
>> noout,nodeep-scrub flag(s) set
>>
>> *Ceph version*
>>
>> ceph version 0.94.10 (b1e0532418e4631af01acbc0cedd426f1905f4af)
>>
>> Regards,
>> Nasos Pan
>>
>> _______________________________________________
>> ceph-users mailing list
>> ceph-***@lists.ceph.com
>> http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
>>
Loading...