--
-- Copyright 2015-2019 Intel Corporation.
-- This software and the related documents are Intel copyrighted materials, and your use of them 
-- is governed by the express license under which they were provided to you ("License"). Unless the 
-- License provides otherwise, you may not use, modify, copy, publish, distribute, disclose or 
-- transmit this software or the related documents without Intel's prior written permission.
-- 
-- This software and the related documents are provided as is, with no express or implied warranties, 
-- other than those that are expressly stated in the License.
-- 
--


-- TASK:
-- Customer segmentation for return analysis: Customers are separated
-- along the following dimensions: return frequency, return order ratio (total
-- number of orders partially or fully returned versus the total number of orders),
-- return item ratio (total number of items returned versus the number of items
-- purchased), return amount ratio (total monetary amount of items returned versus
-- the amount purchased). Consider the store returns during
-- a given year for the computation.

-- IMPLEMENTATION NOTICE:
-- hive provides the input for the clustering program
-- The input format for the clustering is:
--   user surrogate key, 
--   order ratio (number of returns / number of orders), 
--   item ratio (number of returned items / number of ordered items), 
--   money ratio (returned money / paid money), 
--   number of returns


-- Resources


-- The final result is globally ordered; these settings enable the parallel
-- (sampling-based) ORDER BY so that ordering is fast and deterministic.
-- Every value is injected from the matching bigbench.* hiveconf variable, and
-- each assignment is followed by a bare SET that echoes the effective value
-- as a debug print.
SET hive.optimize.sampling.orderby=${hiveconf:bigbench.hive.optimize.sampling.orderby};
SET hive.optimize.sampling.orderby;

SET hive.optimize.sampling.orderby.number=${hiveconf:bigbench.hive.optimize.sampling.orderby.number};
SET hive.optimize.sampling.orderby.number;

SET hive.optimize.sampling.orderby.percent=${hiveconf:bigbench.hive.optimize.sampling.orderby.percent};
SET hive.optimize.sampling.orderby.percent;

-- Result table consumed by the clustering step.
-- ML algorithms expect DOUBLE values as input for their vectors, so every
-- feature column is declared DOUBLE. The table is dropped first so that the
-- script can be re-run.
DROP TABLE IF EXISTS ${hiveconf:TEMP_TABLE};
CREATE TABLE ${hiveconf:TEMP_TABLE} (
  -- user_sk is used as the "label"; all following columns form the feature
  -- vector for the ML algorithm
  user_sk       BIGINT,
  orderRatio    DOUBLE,
  itemsRatio    DOUBLE,
  monetaryRatio DOUBLE,
  frequency     DOUBLE
);


-- There are two possible versions; both are valid points of view.
-- version ONE where customers without returns are also part of the analysis
-- Populate the clustering input with one row per customer found in
-- store_sales. Customers without any matching store_returns row are kept
-- (LEFT JOIN) and get 0.0 for every ratio and for the frequency.
--
-- COALESCE(x / y, 0.0) replaces any NULL quotient with 0.0; the quotient is
-- NULL whenever either operand is NULL, which covers customers with no
-- returns row.
--
-- NOTE(review): the sales aggregate also produces a group for a NULL
-- ss_customer_sk if such rows exist, yielding a row with a NULL user_sk --
-- confirm whether that is intended.
-- NOTE(review): the task description mentions "a given year", but no date
-- predicate is applied in this statement -- confirm against the spec.
INSERT INTO TABLE ${hiveconf:TEMP_TABLE}
SELECT
  sales_agg.ss_customer_sk AS user_sk,
  -- returned orders / purchased orders
  round(COALESCE(returns_agg.returns_count / sales_agg.orders_count, 0.0), 7) AS orderRatio,
  -- returned item rows / purchased item rows
  round(COALESCE(returns_agg.returns_items / sales_agg.orders_items, 0.0), 7) AS itemsRatio,
  -- returned amount / paid amount
  round(COALESCE(returns_agg.returns_money / sales_agg.orders_money, 0.0), 7) AS monetaryRatio,
  -- number of distinct returned tickets
  round(COALESCE(returns_agg.returns_count, 0.0), 0) AS frequency
FROM
  (
    -- per-customer purchase totals
    SELECT
      s.ss_customer_sk,
      COUNT(DISTINCT s.ss_ticket_number) AS orders_count,
      COUNT(s.ss_item_sk)                AS orders_items,
      SUM(s.ss_net_paid)                 AS orders_money
    FROM store_sales s
    GROUP BY s.ss_customer_sk
  ) sales_agg
  LEFT JOIN
  (
    -- per-customer return totals
    SELECT
      r.sr_customer_sk,
      COUNT(DISTINCT r.sr_ticket_number) AS returns_count,
      COUNT(r.sr_item_sk)                AS returns_items,
      SUM(r.sr_return_amt)               AS returns_money
    FROM store_returns r
    GROUP BY r.sr_customer_sk
  ) returns_agg
  ON sales_agg.ss_customer_sk = returns_agg.sr_customer_sk
ORDER BY user_sk
;


--version TWO where customers are filtered out that don't have any returns
-- INSERT INTO TABLE ${hiveconf:TEMP_TABLE}
-- SELECT
-- ss_customer_sk AS user_sk,
-- IF ( (returns_count IS NULL) OR (orders_count IS NULL) OR ((returns_count / orders_count) IS NULL) , 0 , (returns_count / orders_count) ) AS orderRatio,
-- IF ( (returns_items IS NULL) OR (orders_items IS NULL) OR ((returns_items / orders_items) IS NULL) , 0 , (returns_items / orders_items) ) AS itemsRatio,
-- IF ( (returns_money IS NULL) OR (orders_money IS NULL) OR ((returns_money / orders_money) IS NULL) , 0 , (returns_money / orders_money) ) AS monetaryRatio,
-- IF (  returns_count IS NULL                                                                        , 0 ,  returns_count                 ) AS frequency
-- FROM
-- (
-- SELECT
-- ss_customer_sk,
-- -- return order ratio
-- count(distinct(ss_ticket_number)) as orders_count,
-- -- return ss_item_sk ratio
-- COUNT(ss_item_sk) as orders_items,
-- -- return monetary amount ratio
-- SUM( ss_net_paid ) AS orders_money
-- FROM store_sales s
-- GROUP BY ss_customer_sk
-- ) orders,
-- (
-- SELECT
-- sr_customer_sk,
-- -- return order ratio
-- count(distinct(sr_ticket_number)) as returns_count,
-- -- return ss_item_sk ratio
-- COUNT(sr_item_sk) as returns_items,
-- -- return monetary amount ratio
-- SUM( sr_return_amt ) AS returns_money
-- FROM store_returns
-- GROUP BY sr_customer_sk
-- ) returned
-- WHERE ss_customer_sk=sr_customer_sk
-- -- HAVING frequency > 1  -- this would filter out all customers that do not have returns
-- CLUSTER BY user_sk
-- ;
